Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Makefile     |    3
-rw-r--r--  drivers/md/bitmap.c     | 1586
-rw-r--r--  drivers/md/dm-crypt.c   |    3
-rw-r--r--  drivers/md/dm-ioctl.c   |   14
-rw-r--r--  drivers/md/dm-raid1.c   |    1
-rw-r--r--  drivers/md/linear.c     |    3
-rw-r--r--  drivers/md/md.c         |  529
-rw-r--r--  drivers/md/multipath.c  |    3
-rw-r--r--  drivers/md/raid0.c      |   12
-rw-r--r--  drivers/md/raid1.c      |  242
-rw-r--r--  drivers/md/raid10.c     |   30
-rw-r--r--  drivers/md/raid5.c      |   12
-rw-r--r--  drivers/md/raid6main.c  |   12
13 files changed, 2218 insertions(+), 232 deletions(-)
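Taken together, the series adds a write-intent bitmap to md: bitmap.c implements the two-level in-memory/on-disk bitmap, md.c grows md_super_write()/sync_page_io() helpers and bitmap hooks in the superblock code, and raid1 (not shown in full here) is taught to call the new API. As a rough illustration only -- not part of the patch, with make_request()/end_request() as hypothetical stand-ins for the raid1 hooks -- a personality is expected to drive the bitmap roughly like this:

static int make_request(mddev_t *mddev, struct bio *bio)
{
        /* mark the affected chunks dirty before the write is issued */
        bitmap_startwrite(mddev->bitmap, bio->bi_sector, bio_sectors(bio));
        /* ... queue the write to the member disks ... */
        return 0;
}

static void end_request(mddev_t *mddev, struct bio *bio, int uptodate)
{
        /* drop the pending count; the chunk may be cleared once it reaches zero */
        bitmap_endwrite(mddev->bitmap, bio->bi_sector, bio_sectors(bio), uptodate);
}

bitmap_unplug() is called from the unplug path so that dirty bitmap pages reach disk before the data writes they cover are released.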
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 90de9c146a5f..d3efedf6a6ad 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -7,6 +7,7 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
 dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
 dm-snapshot-objs  := dm-snap.o dm-exception-store.o
 dm-mirror-objs    := dm-log.o dm-raid1.o
+md-mod-objs       := md.o bitmap.o
 raid6-objs        := raid6main.o raid6algos.o raid6recov.o raid6tables.o \
                      raid6int1.o raid6int2.o raid6int4.o \
                      raid6int8.o raid6int16.o raid6int32.o \
@@ -28,7 +29,7 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
 obj-$(CONFIG_MD_RAID6)         += raid6.o xor.o
 obj-$(CONFIG_MD_MULTIPATH)     += multipath.o
 obj-$(CONFIG_MD_FAULTY)        += faulty.o
-obj-$(CONFIG_BLK_DEV_MD)       += md.o
+obj-$(CONFIG_BLK_DEV_MD)       += md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)       += dm-mod.o
 obj-$(CONFIG_DM_CRYPT)         += dm-crypt.o
 obj-$(CONFIG_DM_MULTIPATH)     += dm-multipath.o dm-round-robin.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
new file mode 100644
index 000000000000..95980ad6b27b
--- /dev/null
+++ b/drivers/md/bitmap.c
@@ -0,0 +1,1586 @@
1/*
2 * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
3 *
4 * bitmap_create - sets up the bitmap structure
5 * bitmap_destroy - destroys the bitmap structure
6 *
7 * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
8 * - added disk storage for bitmap
9 * - changes to allow various bitmap chunk sizes
10 * - added bitmap daemon (to asynchronously clear bitmap bits from disk)
11 */
12
13/*
14 * Still to do:
15 *
16 * flush after percent set rather than just time based. (maybe both).
17 * wait if count gets too high, wake when it drops to half.
18 * allow bitmap to be mirrored with superblock (before or after...)
19 * allow hot-add to re-instate a current device.
 20 * allow hot-add of bitmap after quiescing device
21 */
22
23#include <linux/module.h>
24#include <linux/version.h>
25#include <linux/errno.h>
26#include <linux/slab.h>
27#include <linux/init.h>
28#include <linux/config.h>
29#include <linux/timer.h>
30#include <linux/sched.h>
31#include <linux/list.h>
32#include <linux/file.h>
33#include <linux/mount.h>
34#include <linux/buffer_head.h>
35#include <linux/raid/md.h>
36#include <linux/raid/bitmap.h>
37
38/* debug macros */
39
40#define DEBUG 0
41
42#if DEBUG
43/* these are for debugging purposes only! */
44
45/* define one and only one of these */
46#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */
47#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set*/
48#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */
49#define INJECT_FAULTS_4 0 /* undef */
50#define INJECT_FAULTS_5 0 /* undef */
51#define INJECT_FAULTS_6 0
52
53/* if these are defined, the driver will fail! debug only */
54#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */
55#define INJECT_FATAL_FAULT_2 0 /* undef */
56#define INJECT_FATAL_FAULT_3 0 /* undef */
57#endif
58
59//#define DPRINTK PRINTK /* set this NULL to avoid verbose debug output */
60#define DPRINTK(x...) do { } while(0)
61
62#ifndef PRINTK
63# if DEBUG > 0
64# define PRINTK(x...) printk(KERN_DEBUG x)
65# else
66# define PRINTK(x...)
67# endif
68#endif
69
70static inline char * bmname(struct bitmap *bitmap)
71{
72 return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
73}
74
75
76/*
77 * test if the bitmap is active
78 */
79int bitmap_active(struct bitmap *bitmap)
80{
81 unsigned long flags;
82 int res = 0;
83
84 if (!bitmap)
85 return res;
86 spin_lock_irqsave(&bitmap->lock, flags);
87 res = bitmap->flags & BITMAP_ACTIVE;
88 spin_unlock_irqrestore(&bitmap->lock, flags);
89 return res;
90}
91
92#define WRITE_POOL_SIZE 256
93/* mempool for queueing pending writes on the bitmap file */
94static void *write_pool_alloc(unsigned int gfp_flags, void *data)
95{
96 return kmalloc(sizeof(struct page_list), gfp_flags);
97}
98
99static void write_pool_free(void *ptr, void *data)
100{
101 kfree(ptr);
102}
103
104/*
105 * just a placeholder - calls kmalloc for bitmap pages
106 */
107static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
108{
109 unsigned char *page;
110
111#if INJECT_FAULTS_1
112 page = NULL;
113#else
114 page = kmalloc(PAGE_SIZE, GFP_NOIO);
115#endif
116 if (!page)
117 printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
118 else
119 PRINTK("%s: bitmap_alloc_page: allocated page at %p\n",
120 bmname(bitmap), page);
121 return page;
122}
123
124/*
125 * for now just a placeholder -- just calls kfree for bitmap pages
126 */
127static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
128{
129 PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
130 kfree(page);
131}
132
133/*
134 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
135 *
136 * 1) check to see if this page is allocated, if it's not then try to alloc
137 * 2) if the alloc fails, set the page's hijacked flag so we'll use the
138 * page pointer directly as a counter
139 *
140 * if we find our page, we increment the page's refcount so that it stays
141 * allocated while we're using it
142 */
143static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create)
144{
145 unsigned char *mappage;
146
147 if (page >= bitmap->pages) {
148 printk(KERN_ALERT
149 "%s: invalid bitmap page request: %lu (> %lu)\n",
150 bmname(bitmap), page, bitmap->pages-1);
151 return -EINVAL;
152 }
153
154
155 if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
156 return 0;
157
158 if (bitmap->bp[page].map) /* page is already allocated, just return */
159 return 0;
160
161 if (!create)
162 return -ENOENT;
163
164 spin_unlock_irq(&bitmap->lock);
165
166 /* this page has not been allocated yet */
167
168 if ((mappage = bitmap_alloc_page(bitmap)) == NULL) {
169 PRINTK("%s: bitmap map page allocation failed, hijacking\n",
170 bmname(bitmap));
171 /* failed - set the hijacked flag so that we can use the
172 * pointer as a counter */
173 spin_lock_irq(&bitmap->lock);
174 if (!bitmap->bp[page].map)
175 bitmap->bp[page].hijacked = 1;
176 goto out;
177 }
178
179 /* got a page */
180
181 spin_lock_irq(&bitmap->lock);
182
183 /* recheck the page */
184
185 if (bitmap->bp[page].map || bitmap->bp[page].hijacked) {
186 /* somebody beat us to getting the page */
187 bitmap_free_page(bitmap, mappage);
188 return 0;
189 }
190
191 /* no page was in place and we have one, so install it */
192
193 memset(mappage, 0, PAGE_SIZE);
194 bitmap->bp[page].map = mappage;
195 bitmap->missing_pages--;
196out:
197 return 0;
198}
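/*
 * Note: in the failure path above the bp[page].map pointer field itself is
 * reused as counter storage -- bitmap_get_counter() below treats a hijacked
 * entry as an array of two bitmap_counter_t values, each covering half of
 * the block range that the page would otherwise track at per-chunk
 * granularity.
 */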
199
200
201/* if page is completely empty, put it back on the free list, or dealloc it */
202/* if page was hijacked, unmark the flag so it might get alloced next time */
203/* Note: lock should be held when calling this */
204static inline void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
205{
206 char *ptr;
207
208 if (bitmap->bp[page].count) /* page is still busy */
209 return;
210
211 /* page is no longer in use, it can be released */
212
213 if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */
214 bitmap->bp[page].hijacked = 0;
215 bitmap->bp[page].map = NULL;
216 return;
217 }
218
219 /* normal case, free the page */
220
221#if 0
222/* actually ... let's not. We will probably need the page again exactly when
 223 * memory is tight and we are flushing to disk
224 */
225 return;
226#else
227 ptr = bitmap->bp[page].map;
228 bitmap->bp[page].map = NULL;
229 bitmap->missing_pages++;
230 bitmap_free_page(bitmap, ptr);
231 return;
232#endif
233}
234
235
236/*
237 * bitmap file handling - read and write the bitmap file and its superblock
238 */
239
240/* copy the pathname of a file to a buffer */
241char *file_path(struct file *file, char *buf, int count)
242{
243 struct dentry *d;
244 struct vfsmount *v;
245
246 if (!buf)
247 return NULL;
248
249 d = file->f_dentry;
250 v = file->f_vfsmnt;
251
252 buf = d_path(d, v, buf, count);
253
254 return IS_ERR(buf) ? NULL : buf;
255}
256
257/*
258 * basic page I/O operations
259 */
260
261/* IO operations when bitmap is stored near all superblocks */
262static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index)
263{
264 /* choose a good rdev and read the page from there */
265
266 mdk_rdev_t *rdev;
267 struct list_head *tmp;
268 struct page *page = alloc_page(GFP_KERNEL);
269 sector_t target;
270
271 if (!page)
272 return ERR_PTR(-ENOMEM);
273 do {
274 ITERATE_RDEV(mddev, rdev, tmp)
275 if (rdev->in_sync && !rdev->faulty)
276 goto found;
277 return ERR_PTR(-EIO);
278
279 found:
280 target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512);
281
282 } while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ));
283
284 page->index = index;
285 return page;
286}
287
288static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
289{
290 mdk_rdev_t *rdev;
291 struct list_head *tmp;
292
293 ITERATE_RDEV(mddev, rdev, tmp)
294 if (rdev->in_sync && !rdev->faulty)
295 md_super_write(mddev, rdev,
296 (rdev->sb_offset<<1) + offset
297 + page->index * (PAGE_SIZE/512),
298 PAGE_SIZE,
299 page);
300
301 if (wait)
302 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
303 return 0;
304}
305
306/*
307 * write out a page to a file
308 */
309static int write_page(struct bitmap *bitmap, struct page *page, int wait)
310{
311 int ret = -ENOMEM;
312
313 if (bitmap->file == NULL)
314 return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
315
316 if (wait)
317 lock_page(page);
318 else {
319 if (TestSetPageLocked(page))
320 return -EAGAIN; /* already locked */
321 if (PageWriteback(page)) {
322 unlock_page(page);
323 return -EAGAIN;
324 }
325 }
326
327 ret = page->mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE);
328 if (!ret)
329 ret = page->mapping->a_ops->commit_write(NULL, page, 0,
330 PAGE_SIZE);
331 if (ret) {
332 unlock_page(page);
333 return ret;
334 }
335
336 set_page_dirty(page); /* force it to be written out */
337
338 if (!wait) {
339 /* add to list to be waited for by daemon */
340 struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO);
341 item->page = page;
342 page_cache_get(page);
343 spin_lock(&bitmap->write_lock);
344 list_add(&item->list, &bitmap->complete_pages);
345 spin_unlock(&bitmap->write_lock);
346 md_wakeup_thread(bitmap->writeback_daemon);
347 }
348 return write_one_page(page, wait);
349}
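/*
 * Note on the two paths above: with no backing file the page goes out via
 * md_super_write() next to the md superblocks and is waited for through
 * mddev->sb_wait; with a bitmap file it is pushed through the file's
 * address_space, and non-blocking callers queue it on ->complete_pages for
 * the writeback daemon to wait on and release.
 */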
350
351/* read a page from a file, pinning it into cache, and return bytes_read */
352static struct page *read_page(struct file *file, unsigned long index,
353 unsigned long *bytes_read)
354{
355 struct inode *inode = file->f_mapping->host;
356 struct page *page = NULL;
357 loff_t isize = i_size_read(inode);
358 unsigned long end_index = isize >> PAGE_CACHE_SHIFT;
359
360 PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE,
361 (unsigned long long)index << PAGE_CACHE_SHIFT);
362
363 page = read_cache_page(inode->i_mapping, index,
364 (filler_t *)inode->i_mapping->a_ops->readpage, file);
365 if (IS_ERR(page))
366 goto out;
367 wait_on_page_locked(page);
368 if (!PageUptodate(page) || PageError(page)) {
369 page_cache_release(page);
370 page = ERR_PTR(-EIO);
371 goto out;
372 }
373
374 if (index > end_index) /* we have read beyond EOF */
375 *bytes_read = 0;
376 else if (index == end_index) /* possible short read */
377 *bytes_read = isize & ~PAGE_CACHE_MASK;
378 else
379 *bytes_read = PAGE_CACHE_SIZE; /* got a full page */
380out:
381 if (IS_ERR(page))
382 printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n",
383 (int)PAGE_CACHE_SIZE,
384 (unsigned long long)index << PAGE_CACHE_SHIFT,
385 PTR_ERR(page));
386 return page;
387}
388
389/*
390 * bitmap file superblock operations
391 */
392
393/* update the event counter and sync the superblock to disk */
394int bitmap_update_sb(struct bitmap *bitmap)
395{
396 bitmap_super_t *sb;
397 unsigned long flags;
398
399 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
400 return 0;
401 spin_lock_irqsave(&bitmap->lock, flags);
402 if (!bitmap->sb_page) { /* no superblock */
403 spin_unlock_irqrestore(&bitmap->lock, flags);
404 return 0;
405 }
406 spin_unlock_irqrestore(&bitmap->lock, flags);
407 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
408 sb->events = cpu_to_le64(bitmap->mddev->events);
409 if (!bitmap->mddev->degraded)
410 sb->events_cleared = cpu_to_le64(bitmap->mddev->events);
411 kunmap(bitmap->sb_page);
412 return write_page(bitmap, bitmap->sb_page, 1);
413}
414
415/* print out the bitmap file superblock */
416void bitmap_print_sb(struct bitmap *bitmap)
417{
418 bitmap_super_t *sb;
419
420 if (!bitmap || !bitmap->sb_page)
421 return;
422 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
423 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
424 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
425 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
426 printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n",
427 *(__u32 *)(sb->uuid+0),
428 *(__u32 *)(sb->uuid+4),
429 *(__u32 *)(sb->uuid+8),
430 *(__u32 *)(sb->uuid+12));
431 printk(KERN_DEBUG " events: %llu\n",
432 (unsigned long long) le64_to_cpu(sb->events));
433 printk(KERN_DEBUG "events cleared: %llu\n",
434 (unsigned long long) le64_to_cpu(sb->events_cleared));
435 printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state));
436 printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize));
437 printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
438 printk(KERN_DEBUG " sync size: %llu KB\n",
439 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
440 kunmap(bitmap->sb_page);
441}
442
443/* read the superblock from the bitmap file and initialize some bitmap fields */
444static int bitmap_read_sb(struct bitmap *bitmap)
445{
446 char *reason = NULL;
447 bitmap_super_t *sb;
448 unsigned long chunksize, daemon_sleep;
449 unsigned long bytes_read;
450 unsigned long long events;
451 int err = -EINVAL;
452
453 /* page 0 is the superblock, read it... */
454 if (bitmap->file)
455 bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read);
456 else {
457 bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0);
458 bytes_read = PAGE_SIZE;
459 }
460 if (IS_ERR(bitmap->sb_page)) {
461 err = PTR_ERR(bitmap->sb_page);
462 bitmap->sb_page = NULL;
463 return err;
464 }
465
466 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
467
468 if (bytes_read < sizeof(*sb)) { /* short read */
469 printk(KERN_INFO "%s: bitmap file superblock truncated\n",
470 bmname(bitmap));
471 err = -ENOSPC;
472 goto out;
473 }
474
475 chunksize = le32_to_cpu(sb->chunksize);
476 daemon_sleep = le32_to_cpu(sb->daemon_sleep);
477
478 /* verify that the bitmap-specific fields are valid */
479 if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
480 reason = "bad magic";
481 else if (sb->version != cpu_to_le32(BITMAP_MAJOR))
482 reason = "unrecognized superblock version";
483 else if (chunksize < 512 || chunksize > (1024 * 1024 * 4))
484 reason = "bitmap chunksize out of range (512B - 4MB)";
485 else if ((1 << ffz(~chunksize)) != chunksize)
486 reason = "bitmap chunksize not a power of 2";
487 else if (daemon_sleep < 1 || daemon_sleep > 15)
488 reason = "daemon sleep period out of range";
489 if (reason) {
490 printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
491 bmname(bitmap), reason);
492 goto out;
493 }
494
495 /* keep the array size field of the bitmap superblock up to date */
496 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
497
498 if (!bitmap->mddev->persistent)
499 goto success;
500
501 /*
502 * if we have a persistent array superblock, compare the
503 * bitmap's UUID and event counter to the mddev's
504 */
505 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
506 printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n",
507 bmname(bitmap));
508 goto out;
509 }
510 events = le64_to_cpu(sb->events);
511 if (events < bitmap->mddev->events) {
512 printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) "
513 "-- forcing full recovery\n", bmname(bitmap), events,
514 (unsigned long long) bitmap->mddev->events);
515 sb->state |= BITMAP_STALE;
516 }
517success:
518 /* assign fields using values from superblock */
519 bitmap->chunksize = chunksize;
520 bitmap->daemon_sleep = daemon_sleep;
521 bitmap->flags |= sb->state;
522 bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
523 err = 0;
524out:
525 kunmap(bitmap->sb_page);
526 if (err)
527 bitmap_print_sb(bitmap);
528 return err;
529}
530
531enum bitmap_mask_op {
532 MASK_SET,
533 MASK_UNSET
534};
535
536/* record the state of the bitmap in the superblock */
537static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
538 enum bitmap_mask_op op)
539{
540 bitmap_super_t *sb;
541 unsigned long flags;
542
543 spin_lock_irqsave(&bitmap->lock, flags);
544 if (!bitmap || !bitmap->sb_page) { /* can't set the state */
545 spin_unlock_irqrestore(&bitmap->lock, flags);
546 return;
547 }
548 page_cache_get(bitmap->sb_page);
549 spin_unlock_irqrestore(&bitmap->lock, flags);
550 sb = (bitmap_super_t *)kmap(bitmap->sb_page);
551 switch (op) {
552 case MASK_SET: sb->state |= bits;
553 break;
554 case MASK_UNSET: sb->state &= ~bits;
555 break;
556 default: BUG();
557 }
558 kunmap(bitmap->sb_page);
559 page_cache_release(bitmap->sb_page);
560}
561
562/*
563 * general bitmap file operations
564 */
565
566/* calculate the index of the page that contains this bit */
567static inline unsigned long file_page_index(unsigned long chunk)
568{
569 return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT;
570}
571
572/* calculate the (bit) offset of this bit within a page */
573static inline unsigned long file_page_offset(unsigned long chunk)
574{
575 return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1);
576}
577
578/*
579 * return a pointer to the page in the filemap that contains the given bit
580 *
581 * this lookup is complicated by the fact that the bitmap sb might be exactly
582 * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
583 * 0 or page 1
584 */
585static inline struct page *filemap_get_page(struct bitmap *bitmap,
586 unsigned long chunk)
587{
588 return bitmap->filemap[file_page_index(chunk) - file_page_index(0)];
589}
590
591
592static void bitmap_file_unmap(struct bitmap *bitmap)
593{
594 struct page **map, *sb_page;
595 unsigned long *attr;
596 int pages;
597 unsigned long flags;
598
599 spin_lock_irqsave(&bitmap->lock, flags);
600 map = bitmap->filemap;
601 bitmap->filemap = NULL;
602 attr = bitmap->filemap_attr;
603 bitmap->filemap_attr = NULL;
604 pages = bitmap->file_pages;
605 bitmap->file_pages = 0;
606 sb_page = bitmap->sb_page;
607 bitmap->sb_page = NULL;
608 spin_unlock_irqrestore(&bitmap->lock, flags);
609
610 while (pages--)
611 if (map[pages]->index != 0) /* 0 is sb_page, release it below */
612 page_cache_release(map[pages]);
613 kfree(map);
614 kfree(attr);
615
616 if (sb_page)
617 page_cache_release(sb_page);
618}
619
620static void bitmap_stop_daemons(struct bitmap *bitmap);
621
622/* dequeue the next item in a page list -- don't call from irq context */
623static struct page_list *dequeue_page(struct bitmap *bitmap)
624{
625 struct page_list *item = NULL;
626 struct list_head *head = &bitmap->complete_pages;
627
628 spin_lock(&bitmap->write_lock);
629 if (list_empty(head))
630 goto out;
631 item = list_entry(head->prev, struct page_list, list);
632 list_del(head->prev);
633out:
634 spin_unlock(&bitmap->write_lock);
635 return item;
636}
637
638static void drain_write_queues(struct bitmap *bitmap)
639{
640 struct page_list *item;
641
642 while ((item = dequeue_page(bitmap))) {
643 /* don't bother to wait */
644 page_cache_release(item->page);
645 mempool_free(item, bitmap->write_pool);
646 }
647
648 wake_up(&bitmap->write_wait);
649}
650
651static void bitmap_file_put(struct bitmap *bitmap)
652{
653 struct file *file;
654 struct inode *inode;
655 unsigned long flags;
656
657 spin_lock_irqsave(&bitmap->lock, flags);
658 file = bitmap->file;
659 bitmap->file = NULL;
660 spin_unlock_irqrestore(&bitmap->lock, flags);
661
662 bitmap_stop_daemons(bitmap);
663
664 drain_write_queues(bitmap);
665
666 bitmap_file_unmap(bitmap);
667
668 if (file) {
669 inode = file->f_mapping->host;
670 spin_lock(&inode->i_lock);
671 atomic_set(&inode->i_writecount, 1); /* allow writes again */
672 spin_unlock(&inode->i_lock);
673 fput(file);
674 }
675}
676
677
678/*
679 * bitmap_file_kick - if an error occurs while manipulating the bitmap file
680 * then it is no longer reliable, so we stop using it and we mark the file
681 * as failed in the superblock
682 */
683static void bitmap_file_kick(struct bitmap *bitmap)
684{
685 char *path, *ptr = NULL;
686
687 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET);
688 bitmap_update_sb(bitmap);
689
690 if (bitmap->file) {
691 path = kmalloc(PAGE_SIZE, GFP_KERNEL);
692 if (path)
693 ptr = file_path(bitmap->file, path, PAGE_SIZE);
694
695 printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n",
696 bmname(bitmap), ptr ? ptr : "");
697
698 kfree(path);
699 }
700
701 bitmap_file_put(bitmap);
702
703 return;
704}
705
706enum bitmap_page_attr {
707 BITMAP_PAGE_DIRTY = 1, // there are set bits that need to be synced
708 BITMAP_PAGE_CLEAN = 2, // there are bits that might need to be cleared
709 BITMAP_PAGE_NEEDWRITE=4, // there are cleared bits that need to be synced
710};
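/*
 * Rough lifecycle of these flags: DIRTY is set by bitmap_file_set_bit() and
 * flushed (then cleared) by bitmap_unplug(); CLEAN is set once a chunk's
 * pending count drops low enough that the on-disk bit may eventually be
 * cleared; NEEDWRITE marks pages whose cleared bits still have to reach
 * disk and is serviced by bitmap_daemon_work().
 */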
711
712static inline void set_page_attr(struct bitmap *bitmap, struct page *page,
713 enum bitmap_page_attr attr)
714{
715 bitmap->filemap_attr[page->index] |= attr;
716}
717
718static inline void clear_page_attr(struct bitmap *bitmap, struct page *page,
719 enum bitmap_page_attr attr)
720{
721 bitmap->filemap_attr[page->index] &= ~attr;
722}
723
724static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page)
725{
726 return bitmap->filemap_attr[page->index];
727}
728
729/*
730 * bitmap_file_set_bit -- called before performing a write to the md device
731 * to set (and eventually sync) a particular bit in the bitmap file
732 *
733 * we set the bit immediately, then we record the page number so that
734 * when an unplug occurs, we can flush the dirty pages out to disk
735 */
736static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
737{
738 unsigned long bit;
739 struct page *page;
740 void *kaddr;
741 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap);
742
743 if (!bitmap->filemap) {
744 return;
745 }
746
747 page = filemap_get_page(bitmap, chunk);
748 bit = file_page_offset(chunk);
749
750
751 /* make sure the page stays cached until it gets written out */
752 if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY))
753 page_cache_get(page);
754
755 /* set the bit */
756 kaddr = kmap_atomic(page, KM_USER0);
757 set_bit(bit, kaddr);
758 kunmap_atomic(kaddr, KM_USER0);
759 PRINTK("set file bit %lu page %lu\n", bit, page->index);
760
761 /* record page number so it gets flushed to disk when unplug occurs */
762 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
763
764}
765
766/* this gets called when the md device is ready to unplug its underlying
767 * (slave) device queues -- before we let any writes go down, we need to
768 * sync the dirty pages of the bitmap file to disk */
769int bitmap_unplug(struct bitmap *bitmap)
770{
771 unsigned long i, attr, flags;
772 struct page *page;
773 int wait = 0;
774 int err;
775
776 if (!bitmap)
777 return 0;
778
779 /* look at each page to see if there are any set bits that need to be
780 * flushed out to disk */
781 for (i = 0; i < bitmap->file_pages; i++) {
782 spin_lock_irqsave(&bitmap->lock, flags);
783 if (!bitmap->filemap) {
784 spin_unlock_irqrestore(&bitmap->lock, flags);
785 return 0;
786 }
787 page = bitmap->filemap[i];
788 attr = get_page_attr(bitmap, page);
789 clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
790 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
791 if ((attr & BITMAP_PAGE_DIRTY))
792 wait = 1;
793 spin_unlock_irqrestore(&bitmap->lock, flags);
794
795 if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) {
796 err = write_page(bitmap, page, 0);
797 if (err == -EAGAIN) {
798 if (attr & BITMAP_PAGE_DIRTY)
799 err = write_page(bitmap, page, 1);
800 else
801 err = 0;
802 }
803 if (err)
804 return 1;
805 }
806 }
807 if (wait) { /* if any writes were performed, we need to wait on them */
808 if (bitmap->file) {
809 spin_lock_irq(&bitmap->write_lock);
810 wait_event_lock_irq(bitmap->write_wait,
811 list_empty(&bitmap->complete_pages), bitmap->write_lock,
812 wake_up_process(bitmap->writeback_daemon->tsk));
813 spin_unlock_irq(&bitmap->write_lock);
814 } else
815 wait_event(bitmap->mddev->sb_wait,
816 atomic_read(&bitmap->mddev->pending_writes)==0);
817 }
818 return 0;
819}
820
821static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
822 unsigned long sectors, int in_sync);
 823/* bitmap_init_from_disk -- called at bitmap_create time to initialize
824 * the in-memory bitmap from the on-disk bitmap -- also, sets up the
825 * memory mapping of the bitmap file
826 * Special cases:
827 * if there's no bitmap file, or if the bitmap file had been
828 * previously kicked from the array, we mark all the bits as
829 * 1's in order to cause a full resync.
830 */
831static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync)
832{
833 unsigned long i, chunks, index, oldindex, bit;
834 struct page *page = NULL, *oldpage = NULL;
835 unsigned long num_pages, bit_cnt = 0;
836 struct file *file;
837 unsigned long bytes, offset, dummy;
838 int outofdate;
839 int ret = -ENOSPC;
840
841 chunks = bitmap->chunks;
842 file = bitmap->file;
843
844 BUG_ON(!file && !bitmap->offset);
845
846#if INJECT_FAULTS_3
847 outofdate = 1;
848#else
849 outofdate = bitmap->flags & BITMAP_STALE;
850#endif
851 if (outofdate)
852 printk(KERN_INFO "%s: bitmap file is out of date, doing full "
853 "recovery\n", bmname(bitmap));
854
855 bytes = (chunks + 7) / 8;
856
857 num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE;
858
859 if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) {
860 printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
861 bmname(bitmap),
862 (unsigned long) i_size_read(file->f_mapping->host),
863 bytes + sizeof(bitmap_super_t));
864 goto out;
865 }
866
867 ret = -ENOMEM;
868
869 bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
870 if (!bitmap->filemap)
871 goto out;
872
873 bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL);
874 if (!bitmap->filemap_attr)
875 goto out;
876
877 memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages);
878
879 oldindex = ~0L;
880
881 for (i = 0; i < chunks; i++) {
882 index = file_page_index(i);
883 bit = file_page_offset(i);
884 if (index != oldindex) { /* this is a new page, read it in */
885 /* unmap the old page, we're done with it */
886 if (oldpage != NULL)
887 kunmap(oldpage);
888 if (index == 0) {
889 /*
890 * if we're here then the superblock page
891 * contains some bits (PAGE_SIZE != sizeof sb)
892 * we've already read it in, so just use it
893 */
894 page = bitmap->sb_page;
895 offset = sizeof(bitmap_super_t);
896 } else if (file) {
897 page = read_page(file, index, &dummy);
898 offset = 0;
899 } else {
900 page = read_sb_page(bitmap->mddev, bitmap->offset, index);
901 offset = 0;
902 }
903 if (IS_ERR(page)) { /* read error */
904 ret = PTR_ERR(page);
905 goto out;
906 }
907
908 oldindex = index;
909 oldpage = page;
910 kmap(page);
911
912 if (outofdate) {
913 /*
914 * if bitmap is out of date, dirty the
915 * whole page and write it out
916 */
917 memset(page_address(page) + offset, 0xff,
918 PAGE_SIZE - offset);
919 ret = write_page(bitmap, page, 1);
920 if (ret) {
921 kunmap(page);
922 /* release, page not in filemap yet */
923 page_cache_release(page);
924 goto out;
925 }
926 }
927
928 bitmap->filemap[bitmap->file_pages++] = page;
929 }
930 if (test_bit(bit, page_address(page))) {
931 /* if the disk bit is set, set the memory bit */
932 bitmap_set_memory_bits(bitmap,
933 i << CHUNK_BLOCK_SHIFT(bitmap), 1, in_sync);
934 bit_cnt++;
935 }
936 }
937
938 /* everything went OK */
939 ret = 0;
940 bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET);
941
942 if (page) /* unmap the last page */
943 kunmap(page);
944
945 if (bit_cnt) { /* Kick recovery if any bits were set */
946 set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery);
947 md_wakeup_thread(bitmap->mddev->thread);
948 }
949
950out:
951 printk(KERN_INFO "%s: bitmap initialized from disk: "
952 "read %lu/%lu pages, set %lu bits, status: %d\n",
953 bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, ret);
954
955 return ret;
956}
957
958void bitmap_write_all(struct bitmap *bitmap)
959{
960 /* We don't actually write all bitmap blocks here,
961 * just flag them as needing to be written
962 */
963
964 unsigned long chunks = bitmap->chunks;
965 unsigned long bytes = (chunks+7)/8 + sizeof(bitmap_super_t);
966 unsigned long num_pages = (bytes + PAGE_SIZE-1) / PAGE_SIZE;
967 while (num_pages--)
968 bitmap->filemap_attr[num_pages] |= BITMAP_PAGE_NEEDWRITE;
969}
970
971
972static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
973{
974 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
975 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
976 bitmap->bp[page].count += inc;
977/*
978 if (page == 0) printk("count page 0, offset %llu: %d gives %d\n",
979 (unsigned long long)offset, inc, bitmap->bp[page].count);
980*/
981 bitmap_checkfree(bitmap, page);
982}
983static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
984 sector_t offset, int *blocks,
985 int create);
986
987/*
988 * bitmap daemon -- periodically wakes up to clean bits and flush pages
989 * out to disk
990 */
991
992int bitmap_daemon_work(struct bitmap *bitmap)
993{
994 unsigned long j;
995 unsigned long flags;
996 struct page *page = NULL, *lastpage = NULL;
997 int err = 0;
998 int blocks;
999 int attr;
1000
1001 if (bitmap == NULL)
1002 return 0;
1003 if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ))
1004 return 0;
1005 bitmap->daemon_lastrun = jiffies;
1006
1007 for (j = 0; j < bitmap->chunks; j++) {
1008 bitmap_counter_t *bmc;
1009 spin_lock_irqsave(&bitmap->lock, flags);
1010 if (!bitmap->filemap) {
1011 /* error or shutdown */
1012 spin_unlock_irqrestore(&bitmap->lock, flags);
1013 break;
1014 }
1015
1016 page = filemap_get_page(bitmap, j);
1017
1018 if (page != lastpage) {
1019 /* skip this page unless it's marked as needing cleaning */
1020 if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) {
1021 if (attr & BITMAP_PAGE_NEEDWRITE) {
1022 page_cache_get(page);
1023 clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1024 }
1025 spin_unlock_irqrestore(&bitmap->lock, flags);
1026 if (attr & BITMAP_PAGE_NEEDWRITE) {
1027 switch (write_page(bitmap, page, 0)) {
1028 case -EAGAIN:
1029 set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE);
1030 break;
1031 case 0:
1032 break;
1033 default:
1034 bitmap_file_kick(bitmap);
1035 }
1036 page_cache_release(page);
1037 }
1038 continue;
1039 }
1040
1041 /* grab the new page, sync and release the old */
1042 page_cache_get(page);
1043 if (lastpage != NULL) {
1044 if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) {
1045 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1046 spin_unlock_irqrestore(&bitmap->lock, flags);
1047 err = write_page(bitmap, lastpage, 0);
1048 if (err == -EAGAIN) {
1049 err = 0;
1050 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1051 }
1052 } else {
1053 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1054 spin_unlock_irqrestore(&bitmap->lock, flags);
1055 }
1056 kunmap(lastpage);
1057 page_cache_release(lastpage);
1058 if (err)
1059 bitmap_file_kick(bitmap);
1060 } else
1061 spin_unlock_irqrestore(&bitmap->lock, flags);
1062 lastpage = page;
1063 kmap(page);
1064/*
1065 printk("bitmap clean at page %lu\n", j);
1066*/
1067 spin_lock_irqsave(&bitmap->lock, flags);
1068 clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1069 }
1070 bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
1071 &blocks, 0);
1072 if (bmc) {
1073/*
1074 if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc);
1075*/
1076 if (*bmc == 2) {
1077 *bmc=1; /* maybe clear the bit next time */
1078 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1079 } else if (*bmc == 1) {
1080 /* we can clear the bit */
1081 *bmc = 0;
1082 bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap),
1083 -1);
1084
1085 /* clear the bit */
1086 clear_bit(file_page_offset(j), page_address(page));
1087 }
1088 }
1089 spin_unlock_irqrestore(&bitmap->lock, flags);
1090 }
1091
1092 /* now sync the final page */
1093 if (lastpage != NULL) {
1094 kunmap(lastpage);
1095 spin_lock_irqsave(&bitmap->lock, flags);
1096 if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) {
1097 clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1098 spin_unlock_irqrestore(&bitmap->lock, flags);
1099 err = write_page(bitmap, lastpage, 0);
1100 if (err == -EAGAIN) {
1101 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1102 err = 0;
1103 }
1104 } else {
1105 set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE);
1106 spin_unlock_irqrestore(&bitmap->lock, flags);
1107 }
1108
1109 page_cache_release(lastpage);
1110 }
1111
1112 return err;
1113}
1114
1115static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon)
1116{
1117 mdk_thread_t *dmn;
1118 unsigned long flags;
1119
1120 /* if no one is waiting on us, we'll free the md thread struct
1121 * and exit, otherwise we let the waiter clean things up */
1122 spin_lock_irqsave(&bitmap->lock, flags);
1123 if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */
1124 *daemon = NULL;
1125 spin_unlock_irqrestore(&bitmap->lock, flags);
1126 kfree(dmn);
1127 complete_and_exit(NULL, 0); /* do_exit not exported */
1128 }
1129 spin_unlock_irqrestore(&bitmap->lock, flags);
1130}
1131
1132static void bitmap_writeback_daemon(mddev_t *mddev)
1133{
1134 struct bitmap *bitmap = mddev->bitmap;
1135 struct page *page;
1136 struct page_list *item;
1137 int err = 0;
1138
1139 if (signal_pending(current)) {
1140 printk(KERN_INFO
1141 "%s: bitmap writeback daemon got signal, exiting...\n",
1142 bmname(bitmap));
1143 err = -EINTR;
1144 goto out;
1145 }
1146
1147 PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap));
1148 /* wait on bitmap page writebacks */
1149 while ((item = dequeue_page(bitmap))) {
1150 page = item->page;
1151 mempool_free(item, bitmap->write_pool);
1152 PRINTK("wait on page writeback: %p\n", page);
1153 wait_on_page_writeback(page);
1154 PRINTK("finished page writeback: %p\n", page);
1155
1156 err = PageError(page);
1157 page_cache_release(page);
1158 if (err) {
1159 printk(KERN_WARNING "%s: bitmap file writeback "
1160 "failed (page %lu): %d\n",
1161 bmname(bitmap), page->index, err);
1162 bitmap_file_kick(bitmap);
1163 goto out;
1164 }
1165 }
1166 out:
1167 wake_up(&bitmap->write_wait);
1168 if (err) {
1169 printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n",
1170 bmname(bitmap), err);
1171 daemon_exit(bitmap, &bitmap->writeback_daemon);
1172 }
1173}
1174
1175static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
1176 void (*func)(mddev_t *), char *name)
1177{
1178 mdk_thread_t *daemon;
1179 unsigned long flags;
1180 char namebuf[32];
1181
1182 spin_lock_irqsave(&bitmap->lock, flags);
1183 *ptr = NULL;
1184
1185 if (!bitmap->file) /* no need for daemon if there's no backing file */
1186 goto out_unlock;
1187
1188 spin_unlock_irqrestore(&bitmap->lock, flags);
1189
1190#if INJECT_FATAL_FAULT_2
1191 daemon = NULL;
1192#else
1193 sprintf(namebuf, "%%s_%s", name);
1194 daemon = md_register_thread(func, bitmap->mddev, namebuf);
1195#endif
1196 if (!daemon) {
1197 printk(KERN_ERR "%s: failed to start bitmap daemon\n",
1198 bmname(bitmap));
1199 return -ECHILD;
1200 }
1201
1202 spin_lock_irqsave(&bitmap->lock, flags);
1203 *ptr = daemon;
1204
1205 md_wakeup_thread(daemon); /* start it running */
1206
1207 PRINTK("%s: %s daemon (pid %d) started...\n",
1208 bmname(bitmap), name, daemon->tsk->pid);
1209out_unlock:
1210 spin_unlock_irqrestore(&bitmap->lock, flags);
1211 return 0;
1212}
1213
1214static int bitmap_start_daemons(struct bitmap *bitmap)
1215{
1216 int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon,
1217 bitmap_writeback_daemon, "bitmap_wb");
1218 return err;
1219}
1220
1221static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr)
1222{
1223 mdk_thread_t *daemon;
1224 unsigned long flags;
1225
1226 spin_lock_irqsave(&bitmap->lock, flags);
1227 daemon = *ptr;
1228 *ptr = NULL;
1229 spin_unlock_irqrestore(&bitmap->lock, flags);
1230 if (daemon)
1231 md_unregister_thread(daemon); /* destroy the thread */
1232}
1233
1234static void bitmap_stop_daemons(struct bitmap *bitmap)
1235{
1236 /* the daemons can't stop themselves... they'll just exit instead... */
1237 if (bitmap->writeback_daemon &&
1238 current->pid != bitmap->writeback_daemon->tsk->pid)
1239 bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon);
1240}
1241
1242static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap,
1243 sector_t offset, int *blocks,
1244 int create)
1245{
1246 /* If 'create', we might release the lock and reclaim it.
1247 * The lock must have been taken with interrupts enabled.
1248 * If !create, we don't release the lock.
1249 */
1250 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap);
1251 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1252 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1253 sector_t csize;
1254
1255 if (bitmap_checkpage(bitmap, page, create) < 0) {
1256 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
1257 *blocks = csize - (offset & (csize- 1));
1258 return NULL;
1259 }
1260 /* now locked ... */
1261
1262 if (bitmap->bp[page].hijacked) { /* hijacked pointer */
1263 /* should we use the first or second counter field
1264 * of the hijacked pointer? */
1265 int hi = (pageoff > PAGE_COUNTER_MASK);
1266 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) +
1267 PAGE_COUNTER_SHIFT - 1);
1268 *blocks = csize - (offset & (csize- 1));
1269 return &((bitmap_counter_t *)
1270 &bitmap->bp[page].map)[hi];
1271 } else { /* page is allocated */
1272 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap));
1273 *blocks = csize - (offset & (csize- 1));
1274 return (bitmap_counter_t *)
1275 &(bitmap->bp[page].map[pageoff]);
1276 }
1277}
1278
1279int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors)
1280{
1281 if (!bitmap) return 0;
1282 while (sectors) {
1283 int blocks;
1284 bitmap_counter_t *bmc;
1285
1286 spin_lock_irq(&bitmap->lock);
1287 bmc = bitmap_get_counter(bitmap, offset, &blocks, 1);
1288 if (!bmc) {
1289 spin_unlock_irq(&bitmap->lock);
1290 return 0;
1291 }
1292
1293 switch(*bmc) {
1294 case 0:
1295 bitmap_file_set_bit(bitmap, offset);
1296 bitmap_count_page(bitmap,offset, 1);
1297 blk_plug_device(bitmap->mddev->queue);
1298 /* fall through */
1299 case 1:
1300 *bmc = 2;
1301 }
1302 if ((*bmc & COUNTER_MAX) == COUNTER_MAX) BUG();
1303 (*bmc)++;
1304
1305 spin_unlock_irq(&bitmap->lock);
1306
1307 offset += blocks;
1308 if (sectors > blocks)
1309 sectors -= blocks;
1310 else sectors = 0;
1311 }
1312 return 0;
1313}
1314
1315void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
1316 int success)
1317{
1318 if (!bitmap) return;
1319 while (sectors) {
1320 int blocks;
1321 unsigned long flags;
1322 bitmap_counter_t *bmc;
1323
1324 spin_lock_irqsave(&bitmap->lock, flags);
1325 bmc = bitmap_get_counter(bitmap, offset, &blocks, 0);
1326 if (!bmc) {
1327 spin_unlock_irqrestore(&bitmap->lock, flags);
1328 return;
1329 }
1330
1331 if (!success && ! (*bmc & NEEDED_MASK))
1332 *bmc |= NEEDED_MASK;
1333
1334 (*bmc)--;
1335 if (*bmc <= 2) {
1336 set_page_attr(bitmap,
1337 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1338 BITMAP_PAGE_CLEAN);
1339 }
1340 spin_unlock_irqrestore(&bitmap->lock, flags);
1341 offset += blocks;
1342 if (sectors > blocks)
1343 sectors -= blocks;
1344 else sectors = 0;
1345 }
1346}
1347
1348int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks)
1349{
1350 bitmap_counter_t *bmc;
1351 int rv;
1352 if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
1353 *blocks = 1024;
1354 return 1; /* always resync if no bitmap */
1355 }
1356 spin_lock_irq(&bitmap->lock);
1357 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1358 rv = 0;
1359 if (bmc) {
1360 /* locked */
1361 if (RESYNC(*bmc))
1362 rv = 1;
1363 else if (NEEDED(*bmc)) {
1364 rv = 1;
1365 *bmc |= RESYNC_MASK;
1366 *bmc &= ~NEEDED_MASK;
1367 }
1368 }
1369 spin_unlock_irq(&bitmap->lock);
1370 return rv;
1371}
1372
1373void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted)
1374{
1375 bitmap_counter_t *bmc;
1376 unsigned long flags;
1377/*
1378 if (offset == 0) printk("bitmap_end_sync 0 (%d)\n", aborted);
1379*/ if (bitmap == NULL) {
1380 *blocks = 1024;
1381 return;
1382 }
1383 spin_lock_irqsave(&bitmap->lock, flags);
1384 bmc = bitmap_get_counter(bitmap, offset, blocks, 0);
1385 if (bmc == NULL)
1386 goto unlock;
1387 /* locked */
1388/*
1389 if (offset == 0) printk("bitmap_end sync found 0x%x, blocks %d\n", *bmc, *blocks);
1390*/
1391 if (RESYNC(*bmc)) {
1392 *bmc &= ~RESYNC_MASK;
1393
1394 if (!NEEDED(*bmc) && aborted)
1395 *bmc |= NEEDED_MASK;
1396 else {
1397 if (*bmc <= 2) {
1398 set_page_attr(bitmap,
1399 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)),
1400 BITMAP_PAGE_CLEAN);
1401 }
1402 }
1403 }
1404 unlock:
1405 spin_unlock_irqrestore(&bitmap->lock, flags);
1406}
1407
1408void bitmap_close_sync(struct bitmap *bitmap)
1409{
1410 /* Sync has finished, and any bitmap chunks that weren't synced
1411 * properly have been aborted. It remains to us to clear the
1412 * RESYNC bit wherever it is still on
1413 */
1414 sector_t sector = 0;
1415 int blocks;
1416 if (!bitmap) return;
1417 while (sector < bitmap->mddev->resync_max_sectors) {
1418 bitmap_end_sync(bitmap, sector, &blocks, 0);
1419/*
1420 if (sector < 500) printk("bitmap_close_sync: sec %llu blks %d\n",
1421 (unsigned long long)sector, blocks);
1422*/ sector += blocks;
1423 }
1424}
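/*
 * Resync-side usage (presumably wired up in the raid1 changes elsewhere in
 * this series): the personality asks bitmap_start_sync() whether a region
 * needs syncing and how many blocks that answer covers, reports each region
 * with bitmap_end_sync(), and calls bitmap_close_sync() once the whole pass
 * is finished to drop any remaining RESYNC markers.
 */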
1425
1426static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset,
1427 unsigned long sectors, int in_sync)
1428{
1429 /* For each chunk covered by any of these sectors, set the
1430 * counter to 1 and set resync_needed unless in_sync. They should all
1431 * be 0 at this point
1432 */
1433 while (sectors) {
1434 int secs;
1435 bitmap_counter_t *bmc;
1436 spin_lock_irq(&bitmap->lock);
1437 bmc = bitmap_get_counter(bitmap, offset, &secs, 1);
1438 if (!bmc) {
1439 spin_unlock_irq(&bitmap->lock);
1440 return;
1441 }
1442 if (! *bmc) {
1443 struct page *page;
1444 *bmc = 1 | (in_sync? 0 : NEEDED_MASK);
1445 bitmap_count_page(bitmap, offset, 1);
1446 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
1447 set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN);
1448 }
1449 spin_unlock_irq(&bitmap->lock);
1450 if (sectors > secs)
1451 sectors -= secs;
1452 else
1453 sectors = 0;
1454 }
1455}
1456
1457/*
1458 * free memory that was allocated
1459 */
1460void bitmap_destroy(mddev_t *mddev)
1461{
1462 unsigned long k, pages;
1463 struct bitmap_page *bp;
1464 struct bitmap *bitmap = mddev->bitmap;
1465
1466 if (!bitmap) /* there was no bitmap */
1467 return;
1468
1469 mddev->bitmap = NULL; /* disconnect from the md device */
1470
1471 /* release the bitmap file and kill the daemon */
1472 bitmap_file_put(bitmap);
1473
1474 bp = bitmap->bp;
1475 pages = bitmap->pages;
1476
1477 /* free all allocated memory */
1478
1479 mempool_destroy(bitmap->write_pool);
1480
1481 if (bp) /* deallocate the page memory */
1482 for (k = 0; k < pages; k++)
1483 if (bp[k].map && !bp[k].hijacked)
1484 kfree(bp[k].map);
1485 kfree(bp);
1486 kfree(bitmap);
1487}
1488
1489/*
1490 * initialize the bitmap structure
1491 * if this returns an error, bitmap_destroy must be called to do clean up
1492 */
1493int bitmap_create(mddev_t *mddev)
1494{
1495 struct bitmap *bitmap;
1496 unsigned long blocks = mddev->resync_max_sectors;
1497 unsigned long chunks;
1498 unsigned long pages;
1499 struct file *file = mddev->bitmap_file;
1500 int err;
1501
1502 BUG_ON(sizeof(bitmap_super_t) != 256);
1503
1504 if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */
1505 return 0;
1506
1507 BUG_ON(file && mddev->bitmap_offset);
1508
1509 bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL);
1510 if (!bitmap)
1511 return -ENOMEM;
1512
1513 memset(bitmap, 0, sizeof(*bitmap));
1514
1515 spin_lock_init(&bitmap->lock);
1516 bitmap->mddev = mddev;
1517 mddev->bitmap = bitmap;
1518
1519 spin_lock_init(&bitmap->write_lock);
1520 INIT_LIST_HEAD(&bitmap->complete_pages);
1521 init_waitqueue_head(&bitmap->write_wait);
1522 bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc,
1523 write_pool_free, NULL);
1524 if (!bitmap->write_pool)
1525 return -ENOMEM;
1526
1527 bitmap->file = file;
1528 bitmap->offset = mddev->bitmap_offset;
1529 if (file) get_file(file);
1530 /* read superblock from bitmap file (this sets bitmap->chunksize) */
1531 err = bitmap_read_sb(bitmap);
1532 if (err)
1533 return err;
1534
1535 bitmap->chunkshift = find_first_bit(&bitmap->chunksize,
1536 sizeof(bitmap->chunksize));
1537
1538 /* now that chunksize and chunkshift are set, we can use these macros */
1539 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) /
1540 CHUNK_BLOCK_RATIO(bitmap);
1541 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1542
1543 BUG_ON(!pages);
1544
1545 bitmap->chunks = chunks;
1546 bitmap->pages = pages;
1547 bitmap->missing_pages = pages;
1548 bitmap->counter_bits = COUNTER_BITS;
1549
1550 bitmap->syncchunk = ~0UL;
1551
1552#if INJECT_FATAL_FAULT_1
1553 bitmap->bp = NULL;
1554#else
1555 bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
1556#endif
1557 if (!bitmap->bp)
1558 return -ENOMEM;
1559 memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp));
1560
1561 bitmap->flags |= BITMAP_ACTIVE;
1562
1563 /* now that we have some pages available, initialize the in-memory
1564 * bitmap from the on-disk bitmap */
1565 err = bitmap_init_from_disk(bitmap, mddev->recovery_cp == MaxSector);
1566 if (err)
1567 return err;
1568
1569 printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
1570 pages, bmname(bitmap));
1571
1572 /* kick off the bitmap daemons */
1573 err = bitmap_start_daemons(bitmap);
1574 if (err)
1575 return err;
1576 return bitmap_update_sb(bitmap);
1577}
1578
1579/* the bitmap API -- for raid personalities */
1580EXPORT_SYMBOL(bitmap_startwrite);
1581EXPORT_SYMBOL(bitmap_endwrite);
1582EXPORT_SYMBOL(bitmap_start_sync);
1583EXPORT_SYMBOL(bitmap_end_sync);
1584EXPORT_SYMBOL(bitmap_unplug);
1585EXPORT_SYMBOL(bitmap_close_sync);
1586EXPORT_SYMBOL(bitmap_daemon_work);
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0dd6c2b5391b..d0a4bab220e5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -704,8 +704,7 @@ static void crypt_dtr(struct dm_target *ti)
         mempool_destroy(cc->page_pool);
         mempool_destroy(cc->io_pool);
 
-        if (cc->iv_mode)
-                kfree(cc->iv_mode);
+        kfree(cc->iv_mode);
         if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
                 cc->iv_gen_ops->dtr(cc);
         crypto_free_tfm(cc->tfm);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index ee3c869d9701..200a0688f717 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -122,14 +122,6 @@ static struct hash_cell *__get_uuid_cell(const char *str)
 /*-----------------------------------------------------------------
  * Inserting, removing and renaming a device.
  *---------------------------------------------------------------*/
-static inline char *kstrdup(const char *str)
-{
-        char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
-        if (r)
-                strcpy(r, str);
-        return r;
-}
-
 static struct hash_cell *alloc_cell(const char *name, const char *uuid,
                                     struct mapped_device *md)
 {
@@ -139,7 +131,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid,
         if (!hc)
                 return NULL;
 
-        hc->name = kstrdup(name);
+        hc->name = kstrdup(name, GFP_KERNEL);
         if (!hc->name) {
                 kfree(hc);
                 return NULL;
@@ -149,7 +141,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid,
                 hc->uuid = NULL;
 
         else {
-                hc->uuid = kstrdup(uuid);
+                hc->uuid = kstrdup(uuid, GFP_KERNEL);
                 if (!hc->uuid) {
                         kfree(hc->name);
                         kfree(hc);
@@ -273,7 +265,7 @@ static int dm_hash_rename(const char *old, const char *new)
         /*
          * duplicate new.
          */
-        new_name = kstrdup(new);
+        new_name = kstrdup(new, GFP_KERNEL);
         if (!new_name)
                 return -ENOMEM;
 
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 6e3cf7e13451..12031c9d3f1e 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1060,6 +1060,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
         }
 
         ti->private = ms;
+        ti->split_io = ms->rh.region_size;
 
         r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
         if (r) {
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index b1941b887f46..8d740013d74d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -217,8 +217,7 @@ static int linear_run (mddev_t *mddev)
         return 0;
 
 out:
-        if (conf)
-                kfree(conf);
+        kfree(conf);
         return 1;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d899204d3743..4a0c57db2b67 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -19,6 +19,9 @@
 
    Neil Brown <neilb@cse.unsw.edu.au>.
 
+   - persistent bitmap code
+     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2, or (at your option)
@@ -33,6 +36,7 @@
 #include <linux/config.h>
 #include <linux/linkage.h>
 #include <linux/raid/md.h>
+#include <linux/raid/bitmap.h>
 #include <linux/sysctl.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/buffer_head.h> /* for invalidate_bdev */
@@ -40,6 +44,8 @@
 
 #include <linux/init.h>
 
+#include <linux/file.h>
+
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
@@ -189,8 +195,7 @@ static mddev_t * mddev_find(dev_t unit)
                 if (mddev->unit == unit) {
                         mddev_get(mddev);
                         spin_unlock(&all_mddevs_lock);
-                        if (new)
-                                kfree(new);
+                        kfree(new);
                         return mddev;
                 }
 
@@ -218,6 +223,8 @@ static mddev_t * mddev_find(dev_t unit)
         INIT_LIST_HEAD(&new->all_mddevs);
         init_timer(&new->safemode_timer);
         atomic_set(&new->active, 1);
+        spin_lock_init(&new->write_lock);
+        init_waitqueue_head(&new->sb_wait);
 
         new->queue = blk_alloc_queue(GFP_KERNEL);
         if (!new->queue) {
@@ -320,6 +327,41 @@ static void free_disk_sb(mdk_rdev_t * rdev)
 }
 
 
+static int super_written(struct bio *bio, unsigned int bytes_done, int error)
+{
+        mdk_rdev_t *rdev = bio->bi_private;
+        if (bio->bi_size)
+                return 1;
+
+        if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+                md_error(rdev->mddev, rdev);
+
+        if (atomic_dec_and_test(&rdev->mddev->pending_writes))
+                wake_up(&rdev->mddev->sb_wait);
+        bio_put(bio);
+        return 0;
+}
+
+void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+                    sector_t sector, int size, struct page *page)
+{
+        /* write first size bytes of page to sector of rdev
+         * Increment mddev->pending_writes before returning
+         * and decrement it on completion, waking up sb_wait
+         * if zero is reached.
+         * If an error occurred, call md_error
+         */
+        struct bio *bio = bio_alloc(GFP_NOIO, 1);
+
+        bio->bi_bdev = rdev->bdev;
+        bio->bi_sector = sector;
+        bio_add_page(bio, page, size, 0);
+        bio->bi_private = rdev;
+        bio->bi_end_io = super_written;
+        atomic_inc(&mddev->pending_writes);
+        submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
+}
+
 static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
 {
         if (bio->bi_size)
@@ -329,7 +371,7 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
         return 0;
 }
 
-static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+int sync_page_io(struct block_device *bdev, sector_t sector, int size,
                    struct page *page, int rw)
 {
         struct bio *bio = bio_alloc(GFP_NOIO, 1);
@@ -416,11 +458,8 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
         ret = 1;
 
 abort:
-        if (tmp1)
-                kfree(tmp1);
-        if (tmp2)
-                kfree(tmp2);
-
+        kfree(tmp1);
+        kfree(tmp2);
         return ret;
 }
 
@@ -569,6 +608,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
         mdp_disk_t *desc;
         mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
 
+        rdev->raid_disk = -1;
+        rdev->in_sync = 0;
         if (mddev->raid_disks == 0) {
                 mddev->major_version = 0;
                 mddev->minor_version = sb->minor_version;
@@ -599,16 +640,35 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
599 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 640 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
600 641
601 mddev->max_disks = MD_SB_DISKS; 642 mddev->max_disks = MD_SB_DISKS;
602 } else { 643
603 __u64 ev1; 644 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
604 ev1 = md_event(sb); 645 mddev->bitmap_file == NULL) {
646 if (mddev->level != 1) {
647 /* FIXME use a better test */
648 printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
649 return -EINVAL;
650 }
651 mddev->bitmap_offset = (MD_SB_BYTES >> 9);
652 }
653
654 } else if (mddev->pers == NULL) {
655 /* Insist on good event counter while assembling */
656 __u64 ev1 = md_event(sb);
605 ++ev1; 657 ++ev1;
606 if (ev1 < mddev->events) 658 if (ev1 < mddev->events)
607 return -EINVAL; 659 return -EINVAL;
608 } 660 } else if (mddev->bitmap) {
661 /* if adding to array with a bitmap, then we can accept an
662 * older device ... but not too old.
663 */
664 __u64 ev1 = md_event(sb);
665 if (ev1 < mddev->bitmap->events_cleared)
666 return 0;
667 } else /* just a hot-add of a new device, leave raid_disk at -1 */
668 return 0;
669
609 if (mddev->level != LEVEL_MULTIPATH) { 670 if (mddev->level != LEVEL_MULTIPATH) {
610 rdev->raid_disk = -1; 671 rdev->faulty = 0;
611 rdev->in_sync = rdev->faulty = 0;
612 desc = sb->disks + rdev->desc_nr; 672 desc = sb->disks + rdev->desc_nr;
613 673
614 if (desc->state & (1<<MD_DISK_FAULTY)) 674 if (desc->state & (1<<MD_DISK_FAULTY))
@@ -618,7 +678,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
618 rdev->in_sync = 1; 678 rdev->in_sync = 1;
619 rdev->raid_disk = desc->raid_disk; 679 rdev->raid_disk = desc->raid_disk;
620 } 680 }
621 } 681 } else /* MULTIPATH devices are always in sync */
682 rdev->in_sync = 1;
622 return 0; 683 return 0;
623} 684}
624 685
@@ -683,6 +744,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
683 sb->layout = mddev->layout; 744 sb->layout = mddev->layout;
684 sb->chunk_size = mddev->chunk_size; 745 sb->chunk_size = mddev->chunk_size;
685 746
747 if (mddev->bitmap && mddev->bitmap_file == NULL)
748 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
749
686 sb->disks[0].state = (1<<MD_DISK_REMOVED); 750 sb->disks[0].state = (1<<MD_DISK_REMOVED);
687 ITERATE_RDEV(mddev,rdev2,tmp) { 751 ITERATE_RDEV(mddev,rdev2,tmp) {
688 mdp_disk_t *d; 752 mdp_disk_t *d;
@@ -780,7 +844,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
780 case 0: 844 case 0:
781 sb_offset = rdev->bdev->bd_inode->i_size >> 9; 845 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
782 sb_offset -= 8*2; 846 sb_offset -= 8*2;
783 sb_offset &= ~(4*2-1); 847 sb_offset &= ~(sector_t)(4*2-1);
784 /* convert from sectors to K */ 848 /* convert from sectors to K */
785 sb_offset /= 2; 849 sb_offset /= 2;
786 break; 850 break;
@@ -860,6 +924,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
860{ 924{
861 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 925 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
862 926
927 rdev->raid_disk = -1;
928 rdev->in_sync = 0;
863 if (mddev->raid_disks == 0) { 929 if (mddev->raid_disks == 0) {
864 mddev->major_version = 1; 930 mddev->major_version = 1;
865 mddev->patch_version = 0; 931 mddev->patch_version = 0;
@@ -877,13 +943,30 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
877 memcpy(mddev->uuid, sb->set_uuid, 16); 943 memcpy(mddev->uuid, sb->set_uuid, 16);
878 944
879 mddev->max_disks = (4096-256)/2; 945 mddev->max_disks = (4096-256)/2;
880 } else { 946
881 __u64 ev1; 947 if ((le32_to_cpu(sb->feature_map) & 1) &&
882 ev1 = le64_to_cpu(sb->events); 948 mddev->bitmap_file == NULL ) {
949 if (mddev->level != 1) {
950 printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
951 return -EINVAL;
952 }
953 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
954 }
955 } else if (mddev->pers == NULL) {
956 /* Insist on good event counter while assembling */
957 __u64 ev1 = le64_to_cpu(sb->events);
883 ++ev1; 958 ++ev1;
884 if (ev1 < mddev->events) 959 if (ev1 < mddev->events)
885 return -EINVAL; 960 return -EINVAL;
886 } 961 } else if (mddev->bitmap) {
962 /* If adding to array with a bitmap, then we can accept an
963 * older device, but not too old.
964 */
965 __u64 ev1 = le64_to_cpu(sb->events);
966 if (ev1 < mddev->bitmap->events_cleared)
967 return 0;
968 } else /* just a hot-add of a new device, leave raid_disk at -1 */
969 return 0;
887 970
888 if (mddev->level != LEVEL_MULTIPATH) { 971 if (mddev->level != LEVEL_MULTIPATH) {
889 int role; 972 int role;
@@ -891,14 +974,10 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
891 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 974 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
892 switch(role) { 975 switch(role) {
893 case 0xffff: /* spare */ 976 case 0xffff: /* spare */
894 rdev->in_sync = 0;
895 rdev->faulty = 0; 977 rdev->faulty = 0;
896 rdev->raid_disk = -1;
897 break; 978 break;
898 case 0xfffe: /* faulty */ 979 case 0xfffe: /* faulty */
899 rdev->in_sync = 0;
900 rdev->faulty = 1; 980 rdev->faulty = 1;
901 rdev->raid_disk = -1;
902 break; 981 break;
903 default: 982 default:
904 rdev->in_sync = 1; 983 rdev->in_sync = 1;
@@ -906,7 +985,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
906 rdev->raid_disk = role; 985 rdev->raid_disk = role;
907 break; 986 break;
908 } 987 }
909 } 988 } else /* MULTIPATH devices are always in sync */
989 rdev->in_sync = 1;
990
910 return 0; 991 return 0;
911} 992}
912 993
@@ -933,6 +1014,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
933 else 1014 else
934 sb->resync_offset = cpu_to_le64(0); 1015 sb->resync_offset = cpu_to_le64(0);
935 1016
1017 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1018 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1019 sb->feature_map = cpu_to_le32(1);
1020 }
1021
936 max_dev = 0; 1022 max_dev = 0;
937 ITERATE_RDEV(mddev,rdev2,tmp) 1023 ITERATE_RDEV(mddev,rdev2,tmp)
938 if (rdev2->desc_nr+1 > max_dev) 1024 if (rdev2->desc_nr+1 > max_dev)
@@ -1196,8 +1282,11 @@ void md_print_devices(void)
1196 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1282 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1197 printk("md: **********************************\n"); 1283 printk("md: **********************************\n");
1198 ITERATE_MDDEV(mddev,tmp) { 1284 ITERATE_MDDEV(mddev,tmp) {
1199 printk("%s: ", mdname(mddev));
1200 1285
1286 if (mddev->bitmap)
1287 bitmap_print_sb(mddev->bitmap);
1288 else
1289 printk("%s: ", mdname(mddev));
1201 ITERATE_RDEV(mddev,rdev,tmp2) 1290 ITERATE_RDEV(mddev,rdev,tmp2)
1202 printk("<%s>", bdevname(rdev->bdev,b)); 1291 printk("<%s>", bdevname(rdev->bdev,b));
1203 printk("\n"); 1292 printk("\n");
@@ -1210,30 +1299,6 @@ void md_print_devices(void)
1210} 1299}
1211 1300
1212 1301
1213static int write_disk_sb(mdk_rdev_t * rdev)
1214{
1215 char b[BDEVNAME_SIZE];
1216 if (!rdev->sb_loaded) {
1217 MD_BUG();
1218 return 1;
1219 }
1220 if (rdev->faulty) {
1221 MD_BUG();
1222 return 1;
1223 }
1224
1225 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1226 bdevname(rdev->bdev,b),
1227 (unsigned long long)rdev->sb_offset);
1228
1229 if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
1230 return 0;
1231
1232 printk("md: write_disk_sb failed for device %s\n",
1233 bdevname(rdev->bdev,b));
1234 return 1;
1235}
1236
1237static void sync_sbs(mddev_t * mddev) 1302static void sync_sbs(mddev_t * mddev)
1238{ 1303{
1239 mdk_rdev_t *rdev; 1304 mdk_rdev_t *rdev;
@@ -1248,12 +1313,14 @@ static void sync_sbs(mddev_t * mddev)
1248 1313
1249static void md_update_sb(mddev_t * mddev) 1314static void md_update_sb(mddev_t * mddev)
1250{ 1315{
1251 int err, count = 100; 1316 int err;
1252 struct list_head *tmp; 1317 struct list_head *tmp;
1253 mdk_rdev_t *rdev; 1318 mdk_rdev_t *rdev;
1319 int sync_req;
1254 1320
1255 mddev->sb_dirty = 0;
1256repeat: 1321repeat:
1322 spin_lock(&mddev->write_lock);
1323 sync_req = mddev->in_sync;
1257 mddev->utime = get_seconds(); 1324 mddev->utime = get_seconds();
1258 mddev->events ++; 1325 mddev->events ++;
1259 1326
@@ -1266,20 +1333,26 @@ repeat:
1266 MD_BUG(); 1333 MD_BUG();
1267 mddev->events --; 1334 mddev->events --;
1268 } 1335 }
1336 mddev->sb_dirty = 2;
1269 sync_sbs(mddev); 1337 sync_sbs(mddev);
1270 1338
1271 /* 1339 /*
1272 * do not write anything to disk if using 1340 * do not write anything to disk if using
1273 * nonpersistent superblocks 1341 * nonpersistent superblocks
1274 */ 1342 */
1275 if (!mddev->persistent) 1343 if (!mddev->persistent) {
1344 mddev->sb_dirty = 0;
1345 spin_unlock(&mddev->write_lock);
1346 wake_up(&mddev->sb_wait);
1276 return; 1347 return;
1348 }
1349 spin_unlock(&mddev->write_lock);
1277 1350
1278 dprintk(KERN_INFO 1351 dprintk(KERN_INFO
1279 "md: updating %s RAID superblock on device (in sync %d)\n", 1352 "md: updating %s RAID superblock on device (in sync %d)\n",
1280 mdname(mddev),mddev->in_sync); 1353 mdname(mddev),mddev->in_sync);
1281 1354
1282 err = 0; 1355 err = bitmap_update_sb(mddev->bitmap);
1283 ITERATE_RDEV(mddev,rdev,tmp) { 1356 ITERATE_RDEV(mddev,rdev,tmp) {
1284 char b[BDEVNAME_SIZE]; 1357 char b[BDEVNAME_SIZE];
1285 dprintk(KERN_INFO "md: "); 1358 dprintk(KERN_INFO "md: ");
@@ -1288,22 +1361,32 @@ repeat:
1288 1361
1289 dprintk("%s ", bdevname(rdev->bdev,b)); 1362 dprintk("%s ", bdevname(rdev->bdev,b));
1290 if (!rdev->faulty) { 1363 if (!rdev->faulty) {
1291 err += write_disk_sb(rdev); 1364 md_super_write(mddev,rdev,
1365 rdev->sb_offset<<1, MD_SB_BYTES,
1366 rdev->sb_page);
1367 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1368 bdevname(rdev->bdev,b),
1369 (unsigned long long)rdev->sb_offset);
1370
1292 } else 1371 } else
1293 dprintk(")\n"); 1372 dprintk(")\n");
1294 if (!err && mddev->level == LEVEL_MULTIPATH) 1373 if (mddev->level == LEVEL_MULTIPATH)
1295 /* only need to write one superblock... */ 1374 /* only need to write one superblock... */
1296 break; 1375 break;
1297 } 1376 }
1298 if (err) { 1377 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
1299 if (--count) { 1378 /* if there was a failure, sb_dirty was set to 1, and we re-write super */
1300 printk(KERN_ERR "md: errors occurred during superblock" 1379
1301 " update, repeating\n"); 1380 spin_lock(&mddev->write_lock);
1302 goto repeat; 1381 if (mddev->in_sync != sync_req || mddev->sb_dirty == 1) {
1303 } 1382 /* have to write it out again */
1304 printk(KERN_ERR \ 1383 spin_unlock(&mddev->write_lock);
1305 "md: excessive errors occurred during superblock update, exiting\n"); 1384 goto repeat;
1306 } 1385 }
1386 mddev->sb_dirty = 0;
1387 spin_unlock(&mddev->write_lock);
1388 wake_up(&mddev->sb_wait);
1389
1307} 1390}
1308 1391
1309/* 1392/*
@@ -1607,12 +1690,19 @@ static int do_md_run(mddev_t * mddev)
1607 1690
1608 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ 1691 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
1609 1692
1610 err = mddev->pers->run(mddev); 1693 /* before we start the array running, initialise the bitmap */
1694 err = bitmap_create(mddev);
1695 if (err)
1696 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
1697 mdname(mddev), err);
1698 else
1699 err = mddev->pers->run(mddev);
1611 if (err) { 1700 if (err) {
1612 printk(KERN_ERR "md: pers->run() failed ...\n"); 1701 printk(KERN_ERR "md: pers->run() failed ...\n");
1613 module_put(mddev->pers->owner); 1702 module_put(mddev->pers->owner);
1614 mddev->pers = NULL; 1703 mddev->pers = NULL;
1615 return -EINVAL; 1704 bitmap_destroy(mddev);
1705 return err;
1616 } 1706 }
1617 atomic_set(&mddev->writes_pending,0); 1707 atomic_set(&mddev->writes_pending,0);
1618 mddev->safemode = 0; 1708 mddev->safemode = 0;
@@ -1725,6 +1815,14 @@ static int do_md_stop(mddev_t * mddev, int ro)
1725 if (ro) 1815 if (ro)
1726 set_disk_ro(disk, 1); 1816 set_disk_ro(disk, 1);
1727 } 1817 }
1818
1819 bitmap_destroy(mddev);
1820 if (mddev->bitmap_file) {
1821 atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1);
1822 fput(mddev->bitmap_file);
1823 mddev->bitmap_file = NULL;
1824 }
1825
1728 /* 1826 /*
1729 * Free resources if final stop 1827 * Free resources if final stop
1730 */ 1828 */
@@ -1983,6 +2081,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
1983 return 0; 2081 return 0;
1984} 2082}
1985 2083
2084static int get_bitmap_file(mddev_t * mddev, void * arg)
2085{
2086 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
2087 char *ptr, *buf = NULL;
2088 int err = -ENOMEM;
2089
2090 file = kmalloc(sizeof(*file), GFP_KERNEL);
2091 if (!file)
2092 goto out;
2093
2094 /* bitmap disabled, zero the first byte and copy out */
2095 if (!mddev->bitmap || !mddev->bitmap->file) {
2096 file->pathname[0] = '\0';
2097 goto copy_out;
2098 }
2099
2100 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
2101 if (!buf)
2102 goto out;
2103
2104 ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
2105 if (!ptr)
2106 goto out;
2107
2108 strcpy(file->pathname, ptr);
2109
2110copy_out:
2111 err = 0;
2112 if (copy_to_user(arg, file, sizeof(*file)))
2113 err = -EFAULT;
2114out:
2115 kfree(buf);
2116 kfree(file);
2117 return err;
2118}
2119
1986static int get_disk_info(mddev_t * mddev, void __user * arg) 2120static int get_disk_info(mddev_t * mddev, void __user * arg)
1987{ 2121{
1988 mdu_disk_info_t info; 2122 mdu_disk_info_t info;
@@ -2078,11 +2212,25 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2078 PTR_ERR(rdev)); 2212 PTR_ERR(rdev));
2079 return PTR_ERR(rdev); 2213 return PTR_ERR(rdev);
2080 } 2214 }
2215 /* set save_raid_disk if appropriate */
2216 if (!mddev->persistent) {
2217 if (info->state & (1<<MD_DISK_SYNC) &&
2218 info->raid_disk < mddev->raid_disks)
2219 rdev->raid_disk = info->raid_disk;
2220 else
2221 rdev->raid_disk = -1;
2222 } else
2223 super_types[mddev->major_version].
2224 validate_super(mddev, rdev);
2225 rdev->saved_raid_disk = rdev->raid_disk;
2226
2081 rdev->in_sync = 0; /* just to be sure */ 2227 rdev->in_sync = 0; /* just to be sure */
2082 rdev->raid_disk = -1; 2228 rdev->raid_disk = -1;
2083 err = bind_rdev_to_array(rdev, mddev); 2229 err = bind_rdev_to_array(rdev, mddev);
2084 if (err) 2230 if (err)
2085 export_rdev(rdev); 2231 export_rdev(rdev);
2232
2233 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2086 if (mddev->thread) 2234 if (mddev->thread)
2087 md_wakeup_thread(mddev->thread); 2235 md_wakeup_thread(mddev->thread);
2088 return err; 2236 return err;
@@ -2256,6 +2404,49 @@ abort_export:
2256 return err; 2404 return err;
2257} 2405}
2258 2406
2407/* similar to deny_write_access, but accounts for our holding a reference
2408 * to the file ourselves */
2409static int deny_bitmap_write_access(struct file * file)
2410{
2411 struct inode *inode = file->f_mapping->host;
2412
2413 spin_lock(&inode->i_lock);
2414 if (atomic_read(&inode->i_writecount) > 1) {
2415 spin_unlock(&inode->i_lock);
2416 return -ETXTBSY;
2417 }
2418 atomic_set(&inode->i_writecount, -1);
2419 spin_unlock(&inode->i_lock);
2420
2421 return 0;
2422}
2423
2424static int set_bitmap_file(mddev_t *mddev, int fd)
2425{
2426 int err;
2427
2428 if (mddev->pers)
2429 return -EBUSY;
2430
2431 mddev->bitmap_file = fget(fd);
2432
2433 if (mddev->bitmap_file == NULL) {
2434 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
2435 mdname(mddev));
2436 return -EBADF;
2437 }
2438
2439 err = deny_bitmap_write_access(mddev->bitmap_file);
2440 if (err) {
2441 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
2442 mdname(mddev));
2443 fput(mddev->bitmap_file);
2444 mddev->bitmap_file = NULL;
2445 } else
2446 mddev->bitmap_offset = 0; /* file overrides offset */
2447 return err;
2448}
2449
2259/* 2450/*
2260 * set_array_info is used two different ways 2451 * set_array_info is used two different ways
2261 * The original usage is when creating a new array. 2452 * The original usage is when creating a new array.
@@ -2567,8 +2758,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2567 /* 2758 /*
2568 * Commands querying/configuring an existing array: 2759 * Commands querying/configuring an existing array:
2569 */ 2760 */
2570 /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ 2761 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
2571 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { 2762 * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
2763 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
2764 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
2572 err = -ENODEV; 2765 err = -ENODEV;
2573 goto abort_unlock; 2766 goto abort_unlock;
2574 } 2767 }
@@ -2582,6 +2775,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2582 err = get_array_info(mddev, argp); 2775 err = get_array_info(mddev, argp);
2583 goto done_unlock; 2776 goto done_unlock;
2584 2777
2778 case GET_BITMAP_FILE:
2779 err = get_bitmap_file(mddev, (void *)arg);
2780 goto done_unlock;
2781
2585 case GET_DISK_INFO: 2782 case GET_DISK_INFO:
2586 err = get_disk_info(mddev, argp); 2783 err = get_disk_info(mddev, argp);
2587 goto done_unlock; 2784 goto done_unlock;
@@ -2662,6 +2859,10 @@ static int md_ioctl(struct inode *inode, struct file *file,
2662 err = do_md_run (mddev); 2859 err = do_md_run (mddev);
2663 goto done_unlock; 2860 goto done_unlock;
2664 2861
2862 case SET_BITMAP_FILE:
2863 err = set_bitmap_file(mddev, (int)arg);
2864 goto done_unlock;
2865
2665 default: 2866 default:
2666 if (_IOC_TYPE(cmd) == MD_MAJOR) 2867 if (_IOC_TYPE(cmd) == MD_MAJOR)
2667 printk(KERN_WARNING "md: %s(pid %d) used" 2868 printk(KERN_WARNING "md: %s(pid %d) used"
@@ -2773,10 +2974,10 @@ static int md_thread(void * arg)
2773 while (thread->run) { 2974 while (thread->run) {
2774 void (*run)(mddev_t *); 2975 void (*run)(mddev_t *);
2775 2976
2776 wait_event_interruptible(thread->wqueue, 2977 wait_event_interruptible_timeout(thread->wqueue,
2777 test_bit(THREAD_WAKEUP, &thread->flags)); 2978 test_bit(THREAD_WAKEUP, &thread->flags),
2778 if (current->flags & PF_FREEZE) 2979 thread->timeout);
2779 refrigerator(PF_FREEZE); 2980 try_to_freeze();
2780 2981
2781 clear_bit(THREAD_WAKEUP, &thread->flags); 2982 clear_bit(THREAD_WAKEUP, &thread->flags);
2782 2983
@@ -2820,6 +3021,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
2820 thread->run = run; 3021 thread->run = run;
2821 thread->mddev = mddev; 3022 thread->mddev = mddev;
2822 thread->name = name; 3023 thread->name = name;
3024 thread->timeout = MAX_SCHEDULE_TIMEOUT;
2823 ret = kernel_thread(md_thread, thread, 0); 3025 ret = kernel_thread(md_thread, thread, 0);
2824 if (ret < 0) { 3026 if (ret < 0) {
2825 kfree(thread); 3027 kfree(thread);
@@ -2858,13 +3060,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
2858 3060
2859 if (!rdev || rdev->faulty) 3061 if (!rdev || rdev->faulty)
2860 return; 3062 return;
2861 3063/*
2862 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 3064 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
2863 mdname(mddev), 3065 mdname(mddev),
2864 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 3066 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
2865 __builtin_return_address(0),__builtin_return_address(1), 3067 __builtin_return_address(0),__builtin_return_address(1),
2866 __builtin_return_address(2),__builtin_return_address(3)); 3068 __builtin_return_address(2),__builtin_return_address(3));
2867 3069*/
2868 if (!mddev->pers->error_handler) 3070 if (!mddev->pers->error_handler)
2869 return; 3071 return;
2870 mddev->pers->error_handler(mddev,rdev); 3072 mddev->pers->error_handler(mddev,rdev);
@@ -3018,6 +3220,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
3018 struct list_head *tmp2; 3220 struct list_head *tmp2;
3019 mdk_rdev_t *rdev; 3221 mdk_rdev_t *rdev;
3020 int i; 3222 int i;
3223 struct bitmap *bitmap;
3021 3224
3022 if (v == (void*)1) { 3225 if (v == (void*)1) {
3023 seq_printf(seq, "Personalities : "); 3226 seq_printf(seq, "Personalities : ");
@@ -3070,10 +3273,35 @@ static int md_seq_show(struct seq_file *seq, void *v)
3070 if (mddev->pers) { 3273 if (mddev->pers) {
3071 mddev->pers->status (seq, mddev); 3274 mddev->pers->status (seq, mddev);
3072 seq_printf(seq, "\n "); 3275 seq_printf(seq, "\n ");
3073 if (mddev->curr_resync > 2) 3276 if (mddev->curr_resync > 2) {
3074 status_resync (seq, mddev); 3277 status_resync (seq, mddev);
3075 else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 3278 seq_printf(seq, "\n ");
3076 seq_printf(seq, " resync=DELAYED"); 3279 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
3280 seq_printf(seq, " resync=DELAYED\n ");
3281 } else
3282 seq_printf(seq, "\n ");
3283
3284 if ((bitmap = mddev->bitmap)) {
3285 unsigned long chunk_kb;
3286 unsigned long flags;
3287 spin_lock_irqsave(&bitmap->lock, flags);
3288 chunk_kb = bitmap->chunksize >> 10;
3289 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
3290 "%lu%s chunk",
3291 bitmap->pages - bitmap->missing_pages,
3292 bitmap->pages,
3293 (bitmap->pages - bitmap->missing_pages)
3294 << (PAGE_SHIFT - 10),
3295 chunk_kb ? chunk_kb : bitmap->chunksize,
3296 chunk_kb ? "KB" : "B");
3297 if (bitmap->file) {
3298 seq_printf(seq, ", file: ");
3299 seq_path(seq, bitmap->file->f_vfsmnt,
3300 bitmap->file->f_dentry," \t\n");
3301 }
3302
3303 seq_printf(seq, "\n");
3304 spin_unlock_irqrestore(&bitmap->lock, flags);
3077 } 3305 }
3078 3306
3079 seq_printf(seq, "\n"); 3307 seq_printf(seq, "\n");
@@ -3176,19 +3404,28 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
3176} 3404}
3177 3405
3178 3406
3179void md_write_start(mddev_t *mddev) 3407/* md_write_start(mddev, bi)
3408 * If we need to update some array metadata (e.g. 'active' flag
3409 * in superblock) before writing, schedule a superblock update
3410 * and wait for it to complete.
3411 */
3412void md_write_start(mddev_t *mddev, struct bio *bi)
3180{ 3413{
3181 if (!atomic_read(&mddev->writes_pending)) { 3414 DEFINE_WAIT(w);
3182 mddev_lock_uninterruptible(mddev); 3415 if (bio_data_dir(bi) != WRITE)
3416 return;
3417
3418 atomic_inc(&mddev->writes_pending);
3419 if (mddev->in_sync) {
3420 spin_lock(&mddev->write_lock);
3183 if (mddev->in_sync) { 3421 if (mddev->in_sync) {
3184 mddev->in_sync = 0; 3422 mddev->in_sync = 0;
3185 del_timer(&mddev->safemode_timer); 3423 mddev->sb_dirty = 1;
3186 md_update_sb(mddev); 3424 md_wakeup_thread(mddev->thread);
3187 } 3425 }
3188 atomic_inc(&mddev->writes_pending); 3426 spin_unlock(&mddev->write_lock);
3189 mddev_unlock(mddev); 3427 }
3190 } else 3428 wait_event(mddev->sb_wait, mddev->sb_dirty==0);
3191 atomic_inc(&mddev->writes_pending);
3192} 3429}
3193 3430
3194void md_write_end(mddev_t *mddev) 3431void md_write_end(mddev_t *mddev)
@@ -3201,37 +3438,6 @@ void md_write_end(mddev_t *mddev)
3201 } 3438 }
3202} 3439}
3203 3440
3204static inline void md_enter_safemode(mddev_t *mddev)
3205{
3206 if (!mddev->safemode) return;
3207 if (mddev->safemode == 2 &&
3208 (atomic_read(&mddev->writes_pending) || mddev->in_sync ||
3209 mddev->recovery_cp != MaxSector))
3210 return; /* avoid the lock */
3211 mddev_lock_uninterruptible(mddev);
3212 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3213 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3214 mddev->in_sync = 1;
3215 md_update_sb(mddev);
3216 }
3217 mddev_unlock(mddev);
3218
3219 if (mddev->safemode == 1)
3220 mddev->safemode = 0;
3221}
3222
3223void md_handle_safemode(mddev_t *mddev)
3224{
3225 if (signal_pending(current)) {
3226 printk(KERN_INFO "md: %s in immediate safe mode\n",
3227 mdname(mddev));
3228 mddev->safemode = 2;
3229 flush_signals(current);
3230 }
3231 md_enter_safemode(mddev);
3232}
3233
3234
3235static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 3441static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3236 3442
3237#define SYNC_MARKS 10 3443#define SYNC_MARKS 10
@@ -3241,12 +3447,13 @@ static void md_do_sync(mddev_t *mddev)
3241 mddev_t *mddev2; 3447 mddev_t *mddev2;
3242 unsigned int currspeed = 0, 3448 unsigned int currspeed = 0,
3243 window; 3449 window;
3244 sector_t max_sectors,j; 3450 sector_t max_sectors,j, io_sectors;
3245 unsigned long mark[SYNC_MARKS]; 3451 unsigned long mark[SYNC_MARKS];
3246 sector_t mark_cnt[SYNC_MARKS]; 3452 sector_t mark_cnt[SYNC_MARKS];
3247 int last_mark,m; 3453 int last_mark,m;
3248 struct list_head *tmp; 3454 struct list_head *tmp;
3249 sector_t last_check; 3455 sector_t last_check;
3456 int skipped = 0;
3250 3457
3251 /* just in case thread restarts... */ 3458 /* just in case thread restarts... */
3252 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 3459 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -3312,7 +3519,7 @@ static void md_do_sync(mddev_t *mddev)
3312 3519
3313 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3520 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3314 /* resync follows the size requested by the personality, 3521 /* resync follows the size requested by the personality,
3315 * which default to physical size, but can be virtual size 3522 * which defaults to physical size, but can be virtual size
3316 */ 3523 */
3317 max_sectors = mddev->resync_max_sectors; 3524 max_sectors = mddev->resync_max_sectors;
3318 else 3525 else
@@ -3327,13 +3534,15 @@ static void md_do_sync(mddev_t *mddev)
3327 sysctl_speed_limit_max); 3534 sysctl_speed_limit_max);
3328 3535
3329 is_mddev_idle(mddev); /* this also initializes IO event counters */ 3536 is_mddev_idle(mddev); /* this also initializes IO event counters */
3330 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3537 /* we don't use the checkpoint if there's a bitmap */
3538 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap)
3331 j = mddev->recovery_cp; 3539 j = mddev->recovery_cp;
3332 else 3540 else
3333 j = 0; 3541 j = 0;
3542 io_sectors = 0;
3334 for (m = 0; m < SYNC_MARKS; m++) { 3543 for (m = 0; m < SYNC_MARKS; m++) {
3335 mark[m] = jiffies; 3544 mark[m] = jiffies;
3336 mark_cnt[m] = j; 3545 mark_cnt[m] = io_sectors;
3337 } 3546 }
3338 last_mark = 0; 3547 last_mark = 0;
3339 mddev->resync_mark = mark[last_mark]; 3548 mddev->resync_mark = mark[last_mark];
@@ -3358,21 +3567,29 @@ static void md_do_sync(mddev_t *mddev)
3358 } 3567 }
3359 3568
3360 while (j < max_sectors) { 3569 while (j < max_sectors) {
3361 int sectors; 3570 sector_t sectors;
3362 3571
3363 sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); 3572 skipped = 0;
3364 if (sectors < 0) { 3573 sectors = mddev->pers->sync_request(mddev, j, &skipped,
3574 currspeed < sysctl_speed_limit_min);
3575 if (sectors == 0) {
3365 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 3576 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3366 goto out; 3577 goto out;
3367 } 3578 }
3368 atomic_add(sectors, &mddev->recovery_active); 3579
3580 if (!skipped) { /* actual IO requested */
3581 io_sectors += sectors;
3582 atomic_add(sectors, &mddev->recovery_active);
3583 }
3584
3369 j += sectors; 3585 j += sectors;
3370 if (j>1) mddev->curr_resync = j; 3586 if (j>1) mddev->curr_resync = j;
3371 3587
3372 if (last_check + window > j || j == max_sectors) 3588
3589 if (last_check + window > io_sectors || j == max_sectors)
3373 continue; 3590 continue;
3374 3591
3375 last_check = j; 3592 last_check = io_sectors;
3376 3593
3377 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || 3594 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
3378 test_bit(MD_RECOVERY_ERR, &mddev->recovery)) 3595 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
@@ -3386,7 +3603,7 @@ static void md_do_sync(mddev_t *mddev)
3386 mddev->resync_mark = mark[next]; 3603 mddev->resync_mark = mark[next];
3387 mddev->resync_mark_cnt = mark_cnt[next]; 3604 mddev->resync_mark_cnt = mark_cnt[next];
3388 mark[next] = jiffies; 3605 mark[next] = jiffies;
3389 mark_cnt[next] = j - atomic_read(&mddev->recovery_active); 3606 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
3390 last_mark = next; 3607 last_mark = next;
3391 } 3608 }
3392 3609
@@ -3413,7 +3630,8 @@ static void md_do_sync(mddev_t *mddev)
3413 mddev->queue->unplug_fn(mddev->queue); 3630 mddev->queue->unplug_fn(mddev->queue);
3414 cond_resched(); 3631 cond_resched();
3415 3632
3416 currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; 3633 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
3634 /((jiffies-mddev->resync_mark)/HZ +1) +1;
3417 3635
3418 if (currspeed > sysctl_speed_limit_min) { 3636 if (currspeed > sysctl_speed_limit_min) {
3419 if ((currspeed > sysctl_speed_limit_max) || 3637 if ((currspeed > sysctl_speed_limit_max) ||
@@ -3433,7 +3651,7 @@ static void md_do_sync(mddev_t *mddev)
3433 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 3651 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
3434 3652
3435 /* tell personality that we are finished */ 3653 /* tell personality that we are finished */
3436 mddev->pers->sync_request(mddev, max_sectors, 1); 3654 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
3437 3655
3438 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && 3656 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3439 mddev->curr_resync > 2 && 3657 mddev->curr_resync > 2 &&
@@ -3447,7 +3665,6 @@ static void md_do_sync(mddev_t *mddev)
3447 mddev->recovery_cp = MaxSector; 3665 mddev->recovery_cp = MaxSector;
3448 } 3666 }
3449 3667
3450 md_enter_safemode(mddev);
3451 skip: 3668 skip:
3452 mddev->curr_resync = 0; 3669 mddev->curr_resync = 0;
3453 wake_up(&resync_wait); 3670 wake_up(&resync_wait);
@@ -3484,20 +3701,48 @@ void md_check_recovery(mddev_t *mddev)
3484 struct list_head *rtmp; 3701 struct list_head *rtmp;
3485 3702
3486 3703
3487 dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); 3704 if (mddev->bitmap)
3705 bitmap_daemon_work(mddev->bitmap);
3488 3706
3489 if (mddev->ro) 3707 if (mddev->ro)
3490 return; 3708 return;
3709
3710 if (signal_pending(current)) {
3711 if (mddev->pers->sync_request) {
3712 printk(KERN_INFO "md: %s in immediate safe mode\n",
3713 mdname(mddev));
3714 mddev->safemode = 2;
3715 }
3716 flush_signals(current);
3717 }
3718
3491 if ( ! ( 3719 if ( ! (
3492 mddev->sb_dirty || 3720 mddev->sb_dirty ||
3493 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 3721 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
3494 test_bit(MD_RECOVERY_DONE, &mddev->recovery) 3722 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
3723 (mddev->safemode == 1) ||
3724 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
3725 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
3495 )) 3726 ))
3496 return; 3727 return;
3728
3497 if (mddev_trylock(mddev)==0) { 3729 if (mddev_trylock(mddev)==0) {
3498 int spares =0; 3730 int spares =0;
3731
3732 spin_lock(&mddev->write_lock);
3733 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3734 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3735 mddev->in_sync = 1;
3736 mddev->sb_dirty = 1;
3737 }
3738 if (mddev->safemode == 1)
3739 mddev->safemode = 0;
3740 spin_unlock(&mddev->write_lock);
3741
3499 if (mddev->sb_dirty) 3742 if (mddev->sb_dirty)
3500 md_update_sb(mddev); 3743 md_update_sb(mddev);
3744
3745
3501 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 3746 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
3502 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 3747 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
3503 /* resync/recovery still happening */ 3748 /* resync/recovery still happening */
@@ -3515,6 +3760,14 @@ void md_check_recovery(mddev_t *mddev)
3515 mddev->pers->spare_active(mddev); 3760 mddev->pers->spare_active(mddev);
3516 } 3761 }
3517 md_update_sb(mddev); 3762 md_update_sb(mddev);
3763
3764 /* if array is no longer degraded, then any saved_raid_disk
3765 * information must be scrapped
3766 */
3767 if (!mddev->degraded)
3768 ITERATE_RDEV(mddev,rdev,rtmp)
3769 rdev->saved_raid_disk = -1;
3770
3518 mddev->recovery = 0; 3771 mddev->recovery = 0;
3519 /* flag recovery needed just to double check */ 3772 /* flag recovery needed just to double check */
3520 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3773 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -3557,6 +3810,13 @@ void md_check_recovery(mddev_t *mddev)
3557 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3810 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3558 if (!spares) 3811 if (!spares)
3559 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3812 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3813 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
3814 /* We are adding a device or devices to an array
3815 * which has the bitmap stored on all devices.
3816 * So make sure all bitmap pages get written
3817 */
3818 bitmap_write_all(mddev->bitmap);
3819 }
3560 mddev->sync_thread = md_register_thread(md_do_sync, 3820 mddev->sync_thread = md_register_thread(md_do_sync,
3561 mddev, 3821 mddev,
3562 "%s_resync"); 3822 "%s_resync");
@@ -3624,6 +3884,8 @@ static int __init md_init(void)
3624 " MD_SB_DISKS=%d\n", 3884 " MD_SB_DISKS=%d\n",
3625 MD_MAJOR_VERSION, MD_MINOR_VERSION, 3885 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3626 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); 3886 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3887 printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR,
3888 BITMAP_MINOR);
3627 3889
3628 if (register_blkdev(MAJOR_NR, "md")) 3890 if (register_blkdev(MAJOR_NR, "md"))
3629 return -1; 3891 return -1;
@@ -3739,7 +4001,6 @@ EXPORT_SYMBOL(md_error);
3739EXPORT_SYMBOL(md_done_sync); 4001EXPORT_SYMBOL(md_done_sync);
3740EXPORT_SYMBOL(md_write_start); 4002EXPORT_SYMBOL(md_write_start);
3741EXPORT_SYMBOL(md_write_end); 4003EXPORT_SYMBOL(md_write_end);
3742EXPORT_SYMBOL(md_handle_safemode);
3743EXPORT_SYMBOL(md_register_thread); 4004EXPORT_SYMBOL(md_register_thread);
3744EXPORT_SYMBOL(md_unregister_thread); 4005EXPORT_SYMBOL(md_unregister_thread);
3745EXPORT_SYMBOL(md_wakeup_thread); 4006EXPORT_SYMBOL(md_wakeup_thread);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 2ae2d709cb15..2d2ca7fa0265 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -533,8 +533,7 @@ static int multipath_run (mddev_t *mddev)
533out_free_conf: 533out_free_conf:
534 if (conf->pool) 534 if (conf->pool)
535 mempool_destroy(conf->pool); 535 mempool_destroy(conf->pool);
536 if (conf->multipaths) 536 kfree(conf->multipaths);
537 kfree(conf->multipaths);
538 kfree(conf); 537 kfree(conf);
539 mddev->private = NULL; 538 mddev->private = NULL;
540out: 539out:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e7d934eca06f..e11dd14d0b43 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -371,10 +371,8 @@ static int raid0_run (mddev_t *mddev)
371 return 0; 371 return 0;
372 372
373out_free_conf: 373out_free_conf:
374 if (conf->strip_zone) 374 kfree(conf->strip_zone);
375 kfree(conf->strip_zone); 375 kfree(conf->devlist);
376 if (conf->devlist)
377 kfree (conf->devlist);
378 kfree(conf); 376 kfree(conf);
379 mddev->private = NULL; 377 mddev->private = NULL;
380out: 378out:
@@ -386,11 +384,11 @@ static int raid0_stop (mddev_t *mddev)
386 raid0_conf_t *conf = mddev_to_conf(mddev); 384 raid0_conf_t *conf = mddev_to_conf(mddev);
387 385
388 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 386 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
389 kfree (conf->hash_table); 387 kfree(conf->hash_table);
390 conf->hash_table = NULL; 388 conf->hash_table = NULL;
391 kfree (conf->strip_zone); 389 kfree(conf->strip_zone);
392 conf->strip_zone = NULL; 390 conf->strip_zone = NULL;
393 kfree (conf); 391 kfree(conf);
394 mddev->private = NULL; 392 mddev->private = NULL;
395 393
396 return 0; 394 return 0;
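The kfree() cleanups here, and the matching ones in md.c, multipath.c and raid1.c, rely on kfree(NULL) being a defined no-op, so the old "if (ptr) kfree(ptr)" guards add nothing. A minimal standalone sketch of the same idiom using free(), which gives the identical guarantee; the struct and field names below are invented for the example:

        #include <stdlib.h>

        struct example_conf {
                int *strip_zone;
                int *devlist;
        };

        static void example_conf_free(struct example_conf *conf)
        {
                if (!conf)
                        return;
                free(conf->strip_zone);   /* no "if (conf->strip_zone)" needed */
                free(conf->devlist);
                free(conf);
        }

        int main(void)
        {
                struct example_conf *conf = calloc(1, sizeof(*conf));
                if (conf)
                        conf->devlist = malloc(16 * sizeof(int));
                example_conf_free(conf);  /* strip_zone is NULL and that is fine */
                return 0;
        }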
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1db5de52d376..ff1dbec864af 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -12,6 +12,15 @@
12 * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk> 12 * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> 13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 * 14 *
15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
16 * bitmapped intelligence in resync:
17 *
18 * - bitmap marked during normal i/o
19 * - bitmap used to skip nondirty blocks during sync
20 *
21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
22 * - persistent bitmap code
23 *
15 * This program is free software; you can redistribute it and/or modify 24 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by 25 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option) 26 * the Free Software Foundation; either version 2, or (at your option)
@@ -22,7 +31,16 @@
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 32 */
24 33
34#include "dm-bio-list.h"
25#include <linux/raid/raid1.h> 35#include <linux/raid/raid1.h>
36#include <linux/raid/bitmap.h>
37
38#define DEBUG 0
39#if DEBUG
40#define PRINTK(x...) printk(x)
41#else
42#define PRINTK(x...)
43#endif
26 44
27/* 45/*
28 * Number of guaranteed r1bios in case of extreme VM load: 46 * Number of guaranteed r1bios in case of extreme VM load:
@@ -287,9 +305,11 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
287 /* 305 /*
288 * this branch is our 'one mirror IO has finished' event handler: 306 * this branch is our 'one mirror IO has finished' event handler:
289 */ 307 */
290 if (!uptodate) 308 if (!uptodate) {
291 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 309 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
292 else 310 /* an I/O failed, we can't clear the bitmap */
311 set_bit(R1BIO_Degraded, &r1_bio->state);
312 } else
293 /* 313 /*
294 * Set R1BIO_Uptodate in our master bio, so that 314 * Set R1BIO_Uptodate in our master bio, so that
295 * we will return a good error code to the higher 315 * we will return a good error code to the higher
@@ -309,6 +329,10 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
309 * already. 329 * already.
310 */ 330 */
311 if (atomic_dec_and_test(&r1_bio->remaining)) { 331 if (atomic_dec_and_test(&r1_bio->remaining)) {
332 /* clear the bitmap if all writes complete successfully */
333 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
334 r1_bio->sectors,
335 !test_bit(R1BIO_Degraded, &r1_bio->state));
312 md_write_end(r1_bio->mddev); 336 md_write_end(r1_bio->mddev);
313 raid_end_bio_io(r1_bio); 337 raid_end_bio_io(r1_bio);
314 } 338 }
@@ -458,7 +482,10 @@ static void unplug_slaves(mddev_t *mddev)
458 482
459static void raid1_unplug(request_queue_t *q) 483static void raid1_unplug(request_queue_t *q)
460{ 484{
461 unplug_slaves(q->queuedata); 485 mddev_t *mddev = q->queuedata;
486
487 unplug_slaves(mddev);
488 md_wakeup_thread(mddev->thread);
462} 489}
463 490
464static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, 491static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -501,16 +528,16 @@ static void device_barrier(conf_t *conf, sector_t sect)
501{ 528{
502 spin_lock_irq(&conf->resync_lock); 529 spin_lock_irq(&conf->resync_lock);
503 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), 530 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
504 conf->resync_lock, unplug_slaves(conf->mddev)); 531 conf->resync_lock, raid1_unplug(conf->mddev->queue));
505 532
506 if (!conf->barrier++) { 533 if (!conf->barrier++) {
507 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 534 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
508 conf->resync_lock, unplug_slaves(conf->mddev)); 535 conf->resync_lock, raid1_unplug(conf->mddev->queue));
509 if (conf->nr_pending) 536 if (conf->nr_pending)
510 BUG(); 537 BUG();
511 } 538 }
512 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, 539 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
513 conf->resync_lock, unplug_slaves(conf->mddev)); 540 conf->resync_lock, raid1_unplug(conf->mddev->queue));
514 conf->next_resync = sect; 541 conf->next_resync = sect;
515 spin_unlock_irq(&conf->resync_lock); 542 spin_unlock_irq(&conf->resync_lock);
516} 543}
@@ -522,14 +549,20 @@ static int make_request(request_queue_t *q, struct bio * bio)
522 mirror_info_t *mirror; 549 mirror_info_t *mirror;
523 r1bio_t *r1_bio; 550 r1bio_t *r1_bio;
524 struct bio *read_bio; 551 struct bio *read_bio;
525 int i, disks; 552 int i, targets = 0, disks;
526 mdk_rdev_t *rdev; 553 mdk_rdev_t *rdev;
554 struct bitmap *bitmap = mddev->bitmap;
555 unsigned long flags;
556 struct bio_list bl;
557
527 558
528 /* 559 /*
529 * Register the new request and wait if the reconstruction 560 * Register the new request and wait if the reconstruction
530 * thread has put up a bar for new requests. 561 * thread has put up a bar for new requests.
531 * Continue immediately if no resync is active currently. 562 * Continue immediately if no resync is active currently.
532 */ 563 */
564 md_write_start(mddev, bio); /* wait on superblock update early */
565
533 spin_lock_irq(&conf->resync_lock); 566 spin_lock_irq(&conf->resync_lock);
534 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); 567 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
535 conf->nr_pending++; 568 conf->nr_pending++;
@@ -552,7 +585,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
552 585
553 r1_bio->master_bio = bio; 586 r1_bio->master_bio = bio;
554 r1_bio->sectors = bio->bi_size >> 9; 587 r1_bio->sectors = bio->bi_size >> 9;
555 588 r1_bio->state = 0;
556 r1_bio->mddev = mddev; 589 r1_bio->mddev = mddev;
557 r1_bio->sector = bio->bi_sector; 590 r1_bio->sector = bio->bi_sector;
558 591
@@ -595,6 +628,13 @@ static int make_request(request_queue_t *q, struct bio * bio)
595 * bios[x] to bio 628 * bios[x] to bio
596 */ 629 */
597 disks = conf->raid_disks; 630 disks = conf->raid_disks;
631#if 0
632 { static int first=1;
633 if (first) printk("First Write sector %llu disks %d\n",
634 (unsigned long long)r1_bio->sector, disks);
635 first = 0;
636 }
637#endif
598 rcu_read_lock(); 638 rcu_read_lock();
599 for (i = 0; i < disks; i++) { 639 for (i = 0; i < disks; i++) {
600 if ((rdev=conf->mirrors[i].rdev) != NULL && 640 if ((rdev=conf->mirrors[i].rdev) != NULL &&
@@ -605,13 +645,21 @@ static int make_request(request_queue_t *q, struct bio * bio)
605 r1_bio->bios[i] = NULL; 645 r1_bio->bios[i] = NULL;
606 } else 646 } else
607 r1_bio->bios[i] = bio; 647 r1_bio->bios[i] = bio;
648 targets++;
608 } else 649 } else
609 r1_bio->bios[i] = NULL; 650 r1_bio->bios[i] = NULL;
610 } 651 }
611 rcu_read_unlock(); 652 rcu_read_unlock();
612 653
613 atomic_set(&r1_bio->remaining, 1); 654 if (targets < conf->raid_disks) {
614 md_write_start(mddev); 655 /* array is degraded, we will not clear the bitmap
656 * on I/O completion (see raid1_end_write_request) */
657 set_bit(R1BIO_Degraded, &r1_bio->state);
658 }
659
660 atomic_set(&r1_bio->remaining, 0);
661
662 bio_list_init(&bl);
615 for (i = 0; i < disks; i++) { 663 for (i = 0; i < disks; i++) {
616 struct bio *mbio; 664 struct bio *mbio;
617 if (!r1_bio->bios[i]) 665 if (!r1_bio->bios[i])
@@ -627,14 +675,23 @@ static int make_request(request_queue_t *q, struct bio * bio)
627 mbio->bi_private = r1_bio; 675 mbio->bi_private = r1_bio;
628 676
629 atomic_inc(&r1_bio->remaining); 677 atomic_inc(&r1_bio->remaining);
630 generic_make_request(mbio);
631 }
632 678
633 if (atomic_dec_and_test(&r1_bio->remaining)) { 679 bio_list_add(&bl, mbio);
634 md_write_end(mddev);
635 raid_end_bio_io(r1_bio);
636 } 680 }
637 681
682 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors);
683 spin_lock_irqsave(&conf->device_lock, flags);
684 bio_list_merge(&conf->pending_bio_list, &bl);
685 bio_list_init(&bl);
686
687 blk_plug_device(mddev->queue);
688 spin_unlock_irqrestore(&conf->device_lock, flags);
689
690#if 0
691 while ((bio = bio_list_pop(&bl)) != NULL)
692 generic_make_request(bio);
693#endif
694
638 return 0; 695 return 0;
639} 696}
640 697
@@ -714,7 +771,7 @@ static void close_sync(conf_t *conf)
714{ 771{
715 spin_lock_irq(&conf->resync_lock); 772 spin_lock_irq(&conf->resync_lock);
716 wait_event_lock_irq(conf->wait_resume, !conf->barrier, 773 wait_event_lock_irq(conf->wait_resume, !conf->barrier,
717 conf->resync_lock, unplug_slaves(conf->mddev)); 774 conf->resync_lock, raid1_unplug(conf->mddev->queue));
718 spin_unlock_irq(&conf->resync_lock); 775 spin_unlock_irq(&conf->resync_lock);
719 776
720 if (conf->barrier) BUG(); 777 if (conf->barrier) BUG();
@@ -754,9 +811,12 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
754{ 811{
755 conf_t *conf = mddev->private; 812 conf_t *conf = mddev->private;
756 int found = 0; 813 int found = 0;
757 int mirror; 814 int mirror = 0;
758 mirror_info_t *p; 815 mirror_info_t *p;
759 816
817 if (rdev->saved_raid_disk >= 0 &&
818 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
819 mirror = rdev->saved_raid_disk;
760 for (mirror=0; mirror < mddev->raid_disks; mirror++) 820 for (mirror=0; mirror < mddev->raid_disks; mirror++)
761 if ( !(p=conf->mirrors+mirror)->rdev) { 821 if ( !(p=conf->mirrors+mirror)->rdev) {
762 822
@@ -773,6 +833,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
773 p->head_position = 0; 833 p->head_position = 0;
774 rdev->raid_disk = mirror; 834 rdev->raid_disk = mirror;
775 found = 1; 835 found = 1;
836 if (rdev->saved_raid_disk != mirror)
837 conf->fullsync = 1;
776 p->rdev = rdev; 838 p->rdev = rdev;
777 break; 839 break;
778 } 840 }
@@ -828,10 +890,11 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
828 * or re-read if the read failed. 890 * or re-read if the read failed.
829 * We don't do much here, just schedule handling by raid1d 891 * We don't do much here, just schedule handling by raid1d
830 */ 892 */
831 if (!uptodate) 893 if (!uptodate) {
832 md_error(r1_bio->mddev, 894 md_error(r1_bio->mddev,
833 conf->mirrors[r1_bio->read_disk].rdev); 895 conf->mirrors[r1_bio->read_disk].rdev);
834 else 896 set_bit(R1BIO_Degraded, &r1_bio->state);
897 } else
835 set_bit(R1BIO_Uptodate, &r1_bio->state); 898 set_bit(R1BIO_Uptodate, &r1_bio->state);
836 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); 899 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
837 reschedule_retry(r1_bio); 900 reschedule_retry(r1_bio);
@@ -855,8 +918,10 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
855 mirror = i; 918 mirror = i;
856 break; 919 break;
857 } 920 }
858 if (!uptodate) 921 if (!uptodate) {
859 md_error(mddev, conf->mirrors[mirror].rdev); 922 md_error(mddev, conf->mirrors[mirror].rdev);
923 set_bit(R1BIO_Degraded, &r1_bio->state);
924 }
860 update_head_pos(mirror, r1_bio); 925 update_head_pos(mirror, r1_bio);
861 926
862 if (atomic_dec_and_test(&r1_bio->remaining)) { 927 if (atomic_dec_and_test(&r1_bio->remaining)) {
@@ -876,6 +941,9 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
876 941
877 bio = r1_bio->bios[r1_bio->read_disk]; 942 bio = r1_bio->bios[r1_bio->read_disk];
878 943
944/*
945 if (r1_bio->sector == 0) printk("First sync write starts\n");
946*/
879 /* 947 /*
880 * schedule writes 948 * schedule writes
881 */ 949 */
@@ -903,10 +971,12 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
903 atomic_inc(&conf->mirrors[i].rdev->nr_pending); 971 atomic_inc(&conf->mirrors[i].rdev->nr_pending);
904 atomic_inc(&r1_bio->remaining); 972 atomic_inc(&r1_bio->remaining);
905 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 973 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
974
906 generic_make_request(wbio); 975 generic_make_request(wbio);
907 } 976 }
908 977
909 if (atomic_dec_and_test(&r1_bio->remaining)) { 978 if (atomic_dec_and_test(&r1_bio->remaining)) {
979 /* if we're here, all write(s) have completed, so clean up */
910 md_done_sync(mddev, r1_bio->sectors, 1); 980 md_done_sync(mddev, r1_bio->sectors, 1);
911 put_buf(r1_bio); 981 put_buf(r1_bio);
912 } 982 }
@@ -931,11 +1001,30 @@ static void raid1d(mddev_t *mddev)
931 mdk_rdev_t *rdev; 1001 mdk_rdev_t *rdev;
932 1002
933 md_check_recovery(mddev); 1003 md_check_recovery(mddev);
934 md_handle_safemode(mddev);
935 1004
936 for (;;) { 1005 for (;;) {
937 char b[BDEVNAME_SIZE]; 1006 char b[BDEVNAME_SIZE];
938 spin_lock_irqsave(&conf->device_lock, flags); 1007 spin_lock_irqsave(&conf->device_lock, flags);
1008
1009 if (conf->pending_bio_list.head) {
1010 bio = bio_list_get(&conf->pending_bio_list);
1011 blk_remove_plug(mddev->queue);
1012 spin_unlock_irqrestore(&conf->device_lock, flags);
1013 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1014 if (bitmap_unplug(mddev->bitmap) != 0)
1015 printk("%s: bitmap file write failed!\n", mdname(mddev));
1016
1017 while (bio) { /* submit pending writes */
1018 struct bio *next = bio->bi_next;
1019 bio->bi_next = NULL;
1020 generic_make_request(bio);
1021 bio = next;
1022 }
1023 unplug = 1;
1024
1025 continue;
1026 }
1027
939 if (list_empty(head)) 1028 if (list_empty(head))
940 break; 1029 break;
941 r1_bio = list_entry(head->prev, r1bio_t, retry_list); 1030 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
@@ -1009,7 +1098,7 @@ static int init_resync(conf_t *conf)
1009 * that can be installed to exclude normal IO requests. 1098 * that can be installed to exclude normal IO requests.
1010 */ 1099 */
1011 1100
1012static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) 1101static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1013{ 1102{
1014 conf_t *conf = mddev_to_conf(mddev); 1103 conf_t *conf = mddev_to_conf(mddev);
1015 mirror_info_t *mirror; 1104 mirror_info_t *mirror;
@@ -1019,17 +1108,43 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1019 int disk; 1108 int disk;
1020 int i; 1109 int i;
1021 int write_targets = 0; 1110 int write_targets = 0;
1111 int sync_blocks;
1022 1112
1023 if (!conf->r1buf_pool) 1113 if (!conf->r1buf_pool)
1114 {
1115/*
1116 printk("sync start - bitmap %p\n", mddev->bitmap);
1117*/
1024 if (init_resync(conf)) 1118 if (init_resync(conf))
1025 return -ENOMEM; 1119 return 0;
1120 }
1026 1121
1027 max_sector = mddev->size << 1; 1122 max_sector = mddev->size << 1;
1028 if (sector_nr >= max_sector) { 1123 if (sector_nr >= max_sector) {
1124 /* If we aborted, we need to abort the
1125 * sync on the 'current' bitmap chunk (there will
1126 * only be one in raid1 resync).
1127 * We can find the current address in mddev->curr_resync
1128 */
1129 if (!conf->fullsync) {
1130 if (mddev->curr_resync < max_sector)
1131 bitmap_end_sync(mddev->bitmap,
1132 mddev->curr_resync,
1133 &sync_blocks, 1);
1134 bitmap_close_sync(mddev->bitmap);
1135 }
1136 if (mddev->curr_resync >= max_sector)
1137 conf->fullsync = 0;
1029 close_sync(conf); 1138 close_sync(conf);
1030 return 0; 1139 return 0;
1031 } 1140 }
1032 1141
1142 if (!conf->fullsync &&
1143 !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks)) {
1144 /* We can skip this block, and probably several more */
1145 *skipped = 1;
1146 return sync_blocks;
1147 }
1033 /* 1148 /*
1034 * If there is non-resync activity waiting for us then 1149 * If there is non-resync activity waiting for us then
1035 * put in a delay to throttle resync. 1150 * put in a delay to throttle resync.
@@ -1068,6 +1183,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1068 1183
1069 r1_bio->mddev = mddev; 1184 r1_bio->mddev = mddev;
1070 r1_bio->sector = sector_nr; 1185 r1_bio->sector = sector_nr;
1186 r1_bio->state = 0;
1071 set_bit(R1BIO_IsSync, &r1_bio->state); 1187 set_bit(R1BIO_IsSync, &r1_bio->state);
1072 r1_bio->read_disk = disk; 1188 r1_bio->read_disk = disk;
1073 1189
@@ -1102,18 +1218,24 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1102 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1218 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1103 bio->bi_private = r1_bio; 1219 bio->bi_private = r1_bio;
1104 } 1220 }
1221
1222 if (write_targets + 1 < conf->raid_disks)
1223 /* array degraded, can't clear bitmap */
1224 set_bit(R1BIO_Degraded, &r1_bio->state);
1225
1105 if (write_targets == 0) { 1226 if (write_targets == 0) {
1106 /* There is nowhere to write, so all non-sync 1227 /* There is nowhere to write, so all non-sync
1107 * drives must be failed - so we are finished 1228 * drives must be failed - so we are finished
1108 */ 1229 */
1109 int rv = max_sector - sector_nr; 1230 sector_t rv = max_sector - sector_nr;
1110 md_done_sync(mddev, rv, 1); 1231 *skipped = 1;
1111 put_buf(r1_bio); 1232 put_buf(r1_bio);
1112 rdev_dec_pending(conf->mirrors[disk].rdev, mddev); 1233 rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
1113 return rv; 1234 return rv;
1114 } 1235 }
1115 1236
1116 nr_sectors = 0; 1237 nr_sectors = 0;
1238 sync_blocks = 0;
1117 do { 1239 do {
1118 struct page *page; 1240 struct page *page;
1119 int len = PAGE_SIZE; 1241 int len = PAGE_SIZE;
@@ -1121,6 +1243,17 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1121 len = (max_sector - sector_nr) << 9; 1243 len = (max_sector - sector_nr) << 9;
1122 if (len == 0) 1244 if (len == 0)
1123 break; 1245 break;
1246 if (!conf->fullsync) {
1247 if (sync_blocks == 0) {
1248 if (!bitmap_start_sync(mddev->bitmap,
1249 sector_nr, &sync_blocks))
1250 break;
1251 if (sync_blocks < (PAGE_SIZE>>9))
1252 BUG();
1253 if (len > (sync_blocks<<9)) len = sync_blocks<<9;
1254 }
1255 }
1256
1124 for (i=0 ; i < conf->raid_disks; i++) { 1257 for (i=0 ; i < conf->raid_disks; i++) {
1125 bio = r1_bio->bios[i]; 1258 bio = r1_bio->bios[i];
1126 if (bio->bi_end_io) { 1259 if (bio->bi_end_io) {
@@ -1143,6 +1276,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1143 } 1276 }
1144 nr_sectors += len>>9; 1277 nr_sectors += len>>9;
1145 sector_nr += len>>9; 1278 sector_nr += len>>9;
1279 sync_blocks -= (len>>9);
1146 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 1280 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1147 bio_full: 1281 bio_full:
1148 bio = r1_bio->bios[disk]; 1282 bio = r1_bio->bios[disk];
@@ -1231,6 +1365,9 @@ static int run(mddev_t *mddev)
1231 init_waitqueue_head(&conf->wait_idle); 1365 init_waitqueue_head(&conf->wait_idle);
1232 init_waitqueue_head(&conf->wait_resume); 1366 init_waitqueue_head(&conf->wait_resume);
1233 1367
1368 bio_list_init(&conf->pending_bio_list);
1369 bio_list_init(&conf->flushing_bio_list);
1370
1234 if (!conf->working_disks) { 1371 if (!conf->working_disks) {
1235 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 1372 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
1236 mdname(mddev)); 1373 mdname(mddev));
@@ -1259,16 +1396,15 @@ static int run(mddev_t *mddev)
1259 conf->last_used = j; 1396 conf->last_used = j;
1260 1397
1261 1398
1262 1399 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
1263 { 1400 if (!mddev->thread) {
1264 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); 1401 printk(KERN_ERR
1265 if (!mddev->thread) { 1402 "raid1: couldn't allocate thread for %s\n",
1266 printk(KERN_ERR 1403 mdname(mddev));
1267 "raid1: couldn't allocate thread for %s\n", 1404 goto out_free_conf;
1268 mdname(mddev));
1269 goto out_free_conf;
1270 }
1271 } 1405 }
1406 if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
1407
1272 printk(KERN_INFO 1408 printk(KERN_INFO
1273 "raid1: raid set %s active with %d out of %d mirrors\n", 1409 "raid1: raid set %s active with %d out of %d mirrors\n",
1274 mdname(mddev), mddev->raid_disks - mddev->degraded, 1410 mdname(mddev), mddev->raid_disks - mddev->degraded,
@@ -1291,10 +1427,8 @@ out_free_conf:
1291 if (conf) { 1427 if (conf) {
1292 if (conf->r1bio_pool) 1428 if (conf->r1bio_pool)
1293 mempool_destroy(conf->r1bio_pool); 1429 mempool_destroy(conf->r1bio_pool);
1294 if (conf->mirrors) 1430 kfree(conf->mirrors);
1295 kfree(conf->mirrors); 1431 kfree(conf->poolinfo);
1296 if (conf->poolinfo)
1297 kfree(conf->poolinfo);
1298 kfree(conf); 1432 kfree(conf);
1299 mddev->private = NULL; 1433 mddev->private = NULL;
1300 } 1434 }
@@ -1311,10 +1445,8 @@ static int stop(mddev_t *mddev)
1311 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 1445 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1312 if (conf->r1bio_pool) 1446 if (conf->r1bio_pool)
1313 mempool_destroy(conf->r1bio_pool); 1447 mempool_destroy(conf->r1bio_pool);
1314 if (conf->mirrors) 1448 kfree(conf->mirrors);
1315 kfree(conf->mirrors); 1449 kfree(conf->poolinfo);
1316 if (conf->poolinfo)
1317 kfree(conf->poolinfo);
1318 kfree(conf); 1450 kfree(conf);
1319 mddev->private = NULL; 1451 mddev->private = NULL;
1320 return 0; 1452 return 0;
@@ -1349,17 +1481,26 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1349 * We allocate a new r1bio_pool if we can. 1481 * We allocate a new r1bio_pool if we can.
1350 * Then raise a device barrier and wait until all IO stops. 1482 * Then raise a device barrier and wait until all IO stops.
1351 * Then resize conf->mirrors and swap in the new r1bio pool. 1483 * Then resize conf->mirrors and swap in the new r1bio pool.
1484 *
1485 * At the same time, we "pack" the devices so that all the missing
1486 * devices have the higher raid_disk numbers.
1352 */ 1487 */
1353 mempool_t *newpool, *oldpool; 1488 mempool_t *newpool, *oldpool;
1354 struct pool_info *newpoolinfo; 1489 struct pool_info *newpoolinfo;
1355 mirror_info_t *newmirrors; 1490 mirror_info_t *newmirrors;
1356 conf_t *conf = mddev_to_conf(mddev); 1491 conf_t *conf = mddev_to_conf(mddev);
1492 int cnt;
1357 1493
1358 int d; 1494 int d, d2;
1359 1495
1360 for (d= raid_disks; d < conf->raid_disks; d++) 1496 if (raid_disks < conf->raid_disks) {
1361 if (conf->mirrors[d].rdev) 1497 cnt=0;
1498 for (d= 0; d < conf->raid_disks; d++)
1499 if (conf->mirrors[d].rdev)
1500 cnt++;
1501 if (cnt > raid_disks)
1362 return -EBUSY; 1502 return -EBUSY;
1503 }
1363 1504
1364 newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); 1505 newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
1365 if (!newpoolinfo) 1506 if (!newpoolinfo)
@@ -1384,14 +1525,18 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1384 spin_lock_irq(&conf->resync_lock); 1525 spin_lock_irq(&conf->resync_lock);
1385 conf->barrier++; 1526 conf->barrier++;
1386 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, 1527 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1387 conf->resync_lock, unplug_slaves(mddev)); 1528 conf->resync_lock, raid1_unplug(mddev->queue));
1388 spin_unlock_irq(&conf->resync_lock); 1529 spin_unlock_irq(&conf->resync_lock);
1389 1530
1390 /* ok, everything is stopped */ 1531 /* ok, everything is stopped */
1391 oldpool = conf->r1bio_pool; 1532 oldpool = conf->r1bio_pool;
1392 conf->r1bio_pool = newpool; 1533 conf->r1bio_pool = newpool;
1393 for (d=0; d < raid_disks && d < conf->raid_disks; d++) 1534
1394 newmirrors[d] = conf->mirrors[d]; 1535 for (d=d2=0; d < conf->raid_disks; d++)
1536 if (conf->mirrors[d].rdev) {
1537 conf->mirrors[d].rdev->raid_disk = d2;
1538 newmirrors[d2++].rdev = conf->mirrors[d].rdev;
1539 }
1395 kfree(conf->mirrors); 1540 kfree(conf->mirrors);
1396 conf->mirrors = newmirrors; 1541 conf->mirrors = newmirrors;
1397 kfree(conf->poolinfo); 1542 kfree(conf->poolinfo);
@@ -1400,6 +1545,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
1400 mddev->degraded += (raid_disks - conf->raid_disks); 1545 mddev->degraded += (raid_disks - conf->raid_disks);
1401 conf->raid_disks = mddev->raid_disks = raid_disks; 1546 conf->raid_disks = mddev->raid_disks = raid_disks;
1402 1547
1548 conf->last_used = 0; /* just make sure it is in-range */
1403 spin_lock_irq(&conf->resync_lock); 1549 spin_lock_irq(&conf->resync_lock);
1404 conf->barrier--; 1550 conf->barrier--;
1405 spin_unlock_irq(&conf->resync_lock); 1551 spin_unlock_irq(&conf->resync_lock);
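The raid1_reshape() hunks above add two behaviours: shrinking is refused with -EBUSY while more devices are in use than the new raid_disks, and the surviving devices are repacked into the lowest slots so every missing device ends up with a higher raid_disk number. Below is a small stand-alone sketch of that rule; the toy struct definitions and the reshape_pack() helper are invented for illustration and carry none of the locking, barrier or mempool handling the real code needs.

#include <stdio.h>
#include <errno.h>

/* toy stand-ins for mirror_info_t / mdk_rdev_t, for illustration only */
struct rdev   { int raid_disk; };
struct mirror { struct rdev *rdev; };

/*
 * Model of the reshape rule: refuse to shrink below the number of devices
 * still present, then "pack" survivors into the lowest slots so every
 * missing device ends up with a higher raid_disk number.
 */
static int reshape_pack(struct mirror *old, int old_disks,
			struct mirror *new, int new_disks)
{
	int d, d2 = 0;

	if (new_disks < old_disks) {
		int cnt = 0;
		for (d = 0; d < old_disks; d++)
			if (old[d].rdev)
				cnt++;
		if (cnt > new_disks)
			return -EBUSY;		/* too many devices still in use */
	}

	for (d = 0; d < old_disks; d++)
		if (old[d].rdev) {
			old[d].rdev->raid_disk = d2;	/* renumber the device */
			new[d2++].rdev = old[d].rdev;	/* and move it down */
		}
	return 0;
}

int main(void)
{
	struct rdev a = { 0 }, c = { 2 };
	struct mirror old[3] = { { &a }, { NULL }, { &c } };	/* slot 1 missing */
	struct mirror new[2] = { { NULL }, { NULL } };

	if (reshape_pack(old, 3, new, 2) == 0)
		for (int d = 0; d < 2; d++)
			printf("slot %d -> %s\n", d, new[d].rdev ? "present" : "empty");
	return 0;
}

In this example the array shrinks from three slots to two: the device that used to sit in slot 2 is renumbered to slot 1, so the remaining hole is at the top, which is exactly what the d/d2 loop in the diff does.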
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3c37be6423d7..62ebb1bc72be 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -700,6 +700,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
700 return 0; 700 return 0;
701 } 701 }
702 702
703 md_write_start(mddev, bio);
704
703 /* 705 /*
704 * Register the new request and wait if the reconstruction 706 * Register the new request and wait if the reconstruction
705 * thread has put up a bar for new requests. 707 * thread has put up a bar for new requests.
@@ -774,7 +776,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
774 rcu_read_unlock(); 776 rcu_read_unlock();
775 777
776 atomic_set(&r10_bio->remaining, 1); 778 atomic_set(&r10_bio->remaining, 1);
777 md_write_start(mddev); 779
778 for (i = 0; i < conf->copies; i++) { 780 for (i = 0; i < conf->copies; i++) {
779 struct bio *mbio; 781 struct bio *mbio;
780 int d = r10_bio->devs[i].devnum; 782 int d = r10_bio->devs[i].devnum;
@@ -1216,7 +1218,6 @@ static void raid10d(mddev_t *mddev)
1216 mdk_rdev_t *rdev; 1218 mdk_rdev_t *rdev;
1217 1219
1218 md_check_recovery(mddev); 1220 md_check_recovery(mddev);
1219 md_handle_safemode(mddev);
1220 1221
1221 for (;;) { 1222 for (;;) {
1222 char b[BDEVNAME_SIZE]; 1223 char b[BDEVNAME_SIZE];
@@ -1319,7 +1320,7 @@ static int init_resync(conf_t *conf)
1319 * 1320 *
1320 */ 1321 */
1321 1322
1322static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) 1323static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1323{ 1324{
1324 conf_t *conf = mddev_to_conf(mddev); 1325 conf_t *conf = mddev_to_conf(mddev);
1325 r10bio_t *r10_bio; 1326 r10bio_t *r10_bio;
@@ -1333,7 +1334,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1333 1334
1334 if (!conf->r10buf_pool) 1335 if (!conf->r10buf_pool)
1335 if (init_resync(conf)) 1336 if (init_resync(conf))
1336 return -ENOMEM; 1337 return 0;
1337 1338
1338 skipped: 1339 skipped:
1339 max_sector = mddev->size << 1; 1340 max_sector = mddev->size << 1;
@@ -1341,15 +1342,15 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1341 max_sector = mddev->resync_max_sectors; 1342 max_sector = mddev->resync_max_sectors;
1342 if (sector_nr >= max_sector) { 1343 if (sector_nr >= max_sector) {
1343 close_sync(conf); 1344 close_sync(conf);
1345 *skipped = 1;
1344 return sectors_skipped; 1346 return sectors_skipped;
1345 } 1347 }
1346 if (chunks_skipped >= conf->raid_disks) { 1348 if (chunks_skipped >= conf->raid_disks) {
1347 /* if there has been nothing to do on any drive, 1349 /* if there has been nothing to do on any drive,
1348 * then there is nothing to do at all.. 1350 * then there is nothing to do at all..
1349 */ 1351 */
1350 sector_t sec = max_sector - sector_nr; 1352 *skipped = 1;
1351 md_done_sync(mddev, sec, 1); 1353 return (max_sector - sector_nr) + sectors_skipped;
1352 return sec + sectors_skipped;
1353 } 1354 }
1354 1355
1355 /* make sure whole request will fit in a chunk - if chunks 1356 /* make sure whole request will fit in a chunk - if chunks
@@ -1563,17 +1564,22 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1563 } 1564 }
1564 } 1565 }
1565 1566
1567 if (sectors_skipped)
1568 /* pretend they weren't skipped, it makes
1569 * no important difference in this case
1570 */
1571 md_done_sync(mddev, sectors_skipped, 1);
1572
1566 return sectors_skipped + nr_sectors; 1573 return sectors_skipped + nr_sectors;
1567 giveup: 1574 giveup:
1568 /* There is nowhere to write, so all non-sync 1575 /* There is nowhere to write, so all non-sync
1569 * drives must be failed, so try the next chunk... 1576 * drives must be failed, so try the next chunk...
1570 */ 1577 */
1571 { 1578 {
1572 int sec = max_sector - sector_nr; 1579 sector_t sec = max_sector - sector_nr;
1573 sectors_skipped += sec; 1580 sectors_skipped += sec;
1574 chunks_skipped ++; 1581 chunks_skipped ++;
1575 sector_nr = max_sector; 1582 sector_nr = max_sector;
1576 md_done_sync(mddev, sec, 1);
1577 goto skipped; 1583 goto skipped;
1578 } 1584 }
1579} 1585}
@@ -1731,8 +1737,7 @@ static int run(mddev_t *mddev)
1731out_free_conf: 1737out_free_conf:
1732 if (conf->r10bio_pool) 1738 if (conf->r10bio_pool)
1733 mempool_destroy(conf->r10bio_pool); 1739 mempool_destroy(conf->r10bio_pool);
1734 if (conf->mirrors) 1740 kfree(conf->mirrors);
1735 kfree(conf->mirrors);
1736 kfree(conf); 1741 kfree(conf);
1737 mddev->private = NULL; 1742 mddev->private = NULL;
1738out: 1743out:
@@ -1748,8 +1753,7 @@ static int stop(mddev_t *mddev)
1748 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 1753 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1749 if (conf->r10bio_pool) 1754 if (conf->r10bio_pool)
1750 mempool_destroy(conf->r10bio_pool); 1755 mempool_destroy(conf->r10bio_pool);
1751 if (conf->mirrors) 1756 kfree(conf->mirrors);
1752 kfree(conf->mirrors);
1753 kfree(conf); 1757 kfree(conf);
1754 mddev->private = NULL; 1758 mddev->private = NULL;
1755 return 0; 1759 return 0;
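Both the raid1.c and raid10.c hunks also drop the "if (ptr) kfree(ptr)" pattern from run()'s error path and from stop(): kfree(NULL) is defined to do nothing, so the guard is redundant. The userspace sketch below makes the same point with free(), which gives the same NULL-is-a-no-op guarantee; struct toy_conf and toy_conf_destroy() are illustrative stand-ins, not the driver's structures.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Userspace analogue of the clean-up above: free(NULL), like kfree(NULL),
 * is defined to do nothing, so "if (ptr) free(ptr)" guards add no safety.
 */
struct toy_conf {
	char *mirrors;
	char *poolinfo;
};

static void toy_conf_destroy(struct toy_conf *conf)
{
	if (!conf)
		return;
	free(conf->mirrors);	/* safe even if the allocation never happened */
	free(conf->poolinfo);	/* still NULL here: freeing it is a no-op */
	free(conf);
}

int main(void)
{
	struct toy_conf *conf = calloc(1, sizeof(*conf));

	if (!conf)
		return 1;
	conf->mirrors = strdup("mirror table");
	/* poolinfo deliberately left unallocated */
	toy_conf_destroy(conf);
	puts("freed without any NULL guards");
	return 0;
}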
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3cb11ac232fa..93a9726cc2d6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1411,6 +1411,8 @@ static int make_request (request_queue_t *q, struct bio * bi)
1411 sector_t logical_sector, last_sector; 1411 sector_t logical_sector, last_sector;
1412 struct stripe_head *sh; 1412 struct stripe_head *sh;
1413 1413
1414 md_write_start(mddev, bi);
1415
1414 if (bio_data_dir(bi)==WRITE) { 1416 if (bio_data_dir(bi)==WRITE) {
1415 disk_stat_inc(mddev->gendisk, writes); 1417 disk_stat_inc(mddev->gendisk, writes);
1416 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); 1418 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1423,8 +1425,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
1423 last_sector = bi->bi_sector + (bi->bi_size>>9); 1425 last_sector = bi->bi_sector + (bi->bi_size>>9);
1424 bi->bi_next = NULL; 1426 bi->bi_next = NULL;
1425 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 1427 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
1426 if ( bio_data_dir(bi) == WRITE ) 1428
1427 md_write_start(mddev);
1428 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 1429 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1429 DEFINE_WAIT(w); 1430 DEFINE_WAIT(w);
1430 1431
@@ -1475,7 +1476,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
1475} 1476}
1476 1477
1477/* FIXME go_faster isn't used */ 1478/* FIXME go_faster isn't used */
1478static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) 1479static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1479{ 1480{
1480 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1481 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1481 struct stripe_head *sh; 1482 struct stripe_head *sh;
@@ -1498,8 +1499,8 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
1498 * nothing we can do. 1499 * nothing we can do.
1499 */ 1500 */
1500 if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1501 if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1501 int rv = (mddev->size << 1) - sector_nr; 1502 sector_t rv = (mddev->size << 1) - sector_nr;
1502 md_done_sync(mddev, rv, 1); 1503 *skipped = 1;
1503 return rv; 1504 return rv;
1504 } 1505 }
1505 1506
@@ -1546,7 +1547,6 @@ static void raid5d (mddev_t *mddev)
1546 PRINTK("+++ raid5d active\n"); 1547 PRINTK("+++ raid5d active\n");
1547 1548
1548 md_check_recovery(mddev); 1549 md_check_recovery(mddev);
1549 md_handle_safemode(mddev);
1550 1550
1551 handled = 0; 1551 handled = 0;
1552 spin_lock_irq(&conf->device_lock); 1552 spin_lock_irq(&conf->device_lock);
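raid5.c (like raid1.c, raid10.c and raid6main.c in the neighbouring hunks) now calls md_write_start(mddev, bi) unconditionally at the top of make_request() and passes the bio, so the helper itself can ignore reads and can mark the array dirty before the request is counted or queued anywhere. The toy model below shows only that ordering idea; the enum, flag and function names are made up for the sketch and ignore the reference counting and sleeping the kernel helper has to handle.

#include <stdio.h>
#include <stdbool.h>

enum rw { READ_REQ, WRITE_REQ };

static bool array_clean = true;		/* models the superblock "in sync" bit */

/*
 * Toy version of the md_write_start() idea: every request passes through
 * here first; reads fall straight through, and the first write flips the
 * array to "dirty" before any data is allowed to reach the disks.
 */
static void write_start(enum rw dir)
{
	if (dir != WRITE_REQ)
		return;			/* the bio argument lets reads be ignored */
	if (array_clean) {
		array_clean = false;
		printf("superblock: marking array dirty before first write\n");
	}
}

static void make_request(enum rw dir, int sector)
{
	write_start(dir);		/* called at the very top, as in the diff */
	printf("%s sector %d\n", dir == WRITE_REQ ? "WRITE" : "READ", sector);
}

int main(void)
{
	make_request(READ_REQ, 0);
	make_request(WRITE_REQ, 8);	/* triggers the clean -> dirty transition */
	make_request(WRITE_REQ, 16);	/* already dirty: nothing extra to do */
	return 0;
}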
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 908edd78a792..f62ea1a73d0d 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1570,6 +1570,8 @@ static int make_request (request_queue_t *q, struct bio * bi)
1570 sector_t logical_sector, last_sector; 1570 sector_t logical_sector, last_sector;
1571 struct stripe_head *sh; 1571 struct stripe_head *sh;
1572 1572
1573 md_write_start(mddev, bi);
1574
1573 if (bio_data_dir(bi)==WRITE) { 1575 if (bio_data_dir(bi)==WRITE) {
1574 disk_stat_inc(mddev->gendisk, writes); 1576 disk_stat_inc(mddev->gendisk, writes);
1575 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); 1577 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1583,8 +1585,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
1583 1585
1584 bi->bi_next = NULL; 1586 bi->bi_next = NULL;
1585 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 1587 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
1586 if ( bio_data_dir(bi) == WRITE ) 1588
1587 md_write_start(mddev);
1588 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 1589 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1589 DEFINE_WAIT(w); 1590 DEFINE_WAIT(w);
1590 1591
@@ -1634,7 +1635,7 @@ static int make_request (request_queue_t *q, struct bio * bi)
1634} 1635}
1635 1636
1636/* FIXME go_faster isn't used */ 1637/* FIXME go_faster isn't used */
1637static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) 1638static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1638{ 1639{
1639 raid6_conf_t *conf = (raid6_conf_t *) mddev->private; 1640 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
1640 struct stripe_head *sh; 1641 struct stripe_head *sh;
@@ -1657,8 +1658,8 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
1657 * nothing we can do. 1658 * nothing we can do.
1658 */ 1659 */
1659 if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 1660 if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1660 int rv = (mddev->size << 1) - sector_nr; 1661 sector_t rv = (mddev->size << 1) - sector_nr;
1661 md_done_sync(mddev, rv, 1); 1662 *skipped = 1;
1662 return rv; 1663 return rv;
1663 } 1664 }
1664 1665
@@ -1705,7 +1706,6 @@ static void raid6d (mddev_t *mddev)
1705 PRINTK("+++ raid6d active\n"); 1706 PRINTK("+++ raid6d active\n");
1706 1707
1707 md_check_recovery(mddev); 1708 md_check_recovery(mddev);
1708 md_handle_safemode(mddev);
1709 1709
1710 handled = 0; 1710 handled = 0;
1711 spin_lock_irq(&conf->device_lock); 1711 spin_lock_irq(&conf->device_lock);
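The other recurring change in this part of the diff is the sync_request() prototype: each personality now returns a sector_t and reports regions it chose to skip through the new *skipped argument instead of calling md_done_sync() on them itself, leaving that bookkeeping to the md core (which is outside this section). The fragment below models the calling convention only; toy_sync_request(), ARRAY_SECTORS and the caller loop are assumptions for illustration and do not reflect md_do_sync()'s real logic.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define ARRAY_SECTORS 1024ULL

/*
 * Model of the new convention: the personality returns how many sectors it
 * covered and sets *skipped when it did no I/O for them, instead of
 * crediting them to md_done_sync() itself.
 */
static sector_t toy_sync_request(sector_t sector_nr, int *skipped)
{
	if (sector_nr >= ARRAY_SECTORS)		/* nothing left to do */
		return 0;
	if (sector_nr < 256) {			/* pretend this range is clean */
		*skipped = 1;
		return 256 - sector_nr;
	}
	return 128;				/* "resynced" 128 sectors */
}

int main(void)
{
	sector_t done = 0, synced = 0, skipped_total = 0;

	while (done < ARRAY_SECTORS) {
		int skipped = 0;
		sector_t n = toy_sync_request(done, &skipped);

		if (n == 0)
			break;
		if (skipped)
			skipped_total += n;	/* credited without any I/O */
		else
			synced += n;
		done += n;
	}
	printf("synced=%llu skipped=%llu\n",
	       (unsigned long long)synced, (unsigned long long)skipped_total);
	return 0;
}

Returning a sector count plus a skipped flag is also why the raid5/raid6 hunks change the local "rv" from int to sector_t: on large arrays the remaining-sector count no longer fits safely in an int.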