Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Makefile                                        |   6
-rw-r--r--  drivers/md/dm-emc.c                                        |   2
-rw-r--r--  drivers/md/dm-exception-store.c                            |  10
-rw-r--r--  drivers/md/dm-io.c                                         |  38
-rw-r--r--  drivers/md/dm-io.h                                         |  79
-rw-r--r--  drivers/md/dm-kcopyd.c (renamed from drivers/md/kcopyd.c)  | 298
-rw-r--r--  drivers/md/dm-log.c                                        | 254
-rw-r--r--  drivers/md/dm-log.h                                        | 131
-rw-r--r--  drivers/md/dm-mpath-hp-sw.c                                |   1
-rw-r--r--  drivers/md/dm-mpath-rdac.c                                 |   1
-rw-r--r--  drivers/md/dm-raid1.c                                      | 132
-rw-r--r--  drivers/md/dm-snap.c                                       |  22
-rw-r--r--  drivers/md/dm-snap.h                                       |   4
-rw-r--r--  drivers/md/dm-table.c                                      |  47
-rw-r--r--  drivers/md/dm-uevent.c                                     |  22
-rw-r--r--  drivers/md/dm.c                                            |  16
-rw-r--r--  drivers/md/dm.h                                            |  98
-rw-r--r--  drivers/md/kcopyd.h                                        |  42
-rw-r--r--  drivers/md/md.c                                            | 129
-rw-r--r--  drivers/md/multipath.c                                     |   3
-rw-r--r--  drivers/md/raid1.c                                         |  31
-rw-r--r--  drivers/md/raid10.c                                        |  33
-rw-r--r--  drivers/md/raid5.c                                         | 191
-rw-r--r--  drivers/md/raid6algos.c                                    |   3
24 files changed, 748 insertions(+), 845 deletions(-)
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index d9aa7edb8780..7be09eeea293 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -3,10 +3,10 @@
 #
 
 dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
-		dm-ioctl.o dm-io.o kcopyd.o
+		dm-ioctl.o dm-io.o dm-kcopyd.o
 dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception-store.o
-dm-mirror-objs := dm-log.o dm-raid1.o
+dm-mirror-objs := dm-raid1.o
 dm-rdac-objs := dm-mpath-rdac.o
 dm-hp-sw-objs := dm-mpath-hp-sw.o
 md-mod-objs := md.o bitmap.o
@@ -39,7 +39,7 @@ obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o
 obj-$(CONFIG_DM_MULTIPATH_HP)	+= dm-hp-sw.o
 obj-$(CONFIG_DM_MULTIPATH_RDAC)	+= dm-rdac.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
-obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o
+obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
 quiet_cmd_unroll = UNROLL $@
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
index 6b91b9ab1d41..3ea5ad4b7805 100644
--- a/drivers/md/dm-emc.c
+++ b/drivers/md/dm-emc.c
@@ -110,8 +110,6 @@ static struct request *get_failover_req(struct emc_handler *h,
 	memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
 	rq->sense_len = 0;
 
-	memset(&rq->cmd, 0, BLK_MAX_CDB);
-
 	rq->timeout = EMC_FAILOVER_TIMEOUT;
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
 	rq->cmd_flags |= REQ_FAILFAST | REQ_NOMERGE;
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 5bbce29f143a..41f408068a7c 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -9,13 +9,13 @@
 
 #include "dm.h"
 #include "dm-snap.h"
-#include "dm-io.h"
-#include "kcopyd.h"
 
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
 
 #define DM_MSG_PREFIX "snapshots"
 #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */
@@ -131,7 +131,7 @@ struct pstore {
 
 static unsigned sectors_to_pages(unsigned sectors)
 {
-	return sectors / (PAGE_SIZE >> 9);
+	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
 }
 
 static int alloc_area(struct pstore *ps)
@@ -159,7 +159,7 @@ static void free_area(struct pstore *ps)
 }
 
 struct mdata_req {
-	struct io_region *where;
+	struct dm_io_region *where;
 	struct dm_io_request *io_req;
 	struct work_struct work;
 	int result;
@@ -177,7 +177,7 @@ static void do_metadata(struct work_struct *work)
  */
 static int chunk_io(struct pstore *ps, uint32_t chunk, int rw, int metadata)
 {
-	struct io_region where = {
+	struct dm_io_region where = {
 		.bdev = ps->snap->cow->bdev,
 		.sector = ps->snap->chunk_size * chunk,
 		.count = ps->snap->chunk_size,
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 8f25f628ef16..4789c42d9a3a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -5,13 +5,14 @@
  * This file is released under the GPL.
  */
 
-#include "dm-io.h"
+#include "dm.h"
 
 #include <linux/bio.h>
 #include <linux/mempool.h>
 #include <linux/module.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/dm-io.h>
 
 struct dm_io_client {
 	mempool_t *pool;
@@ -20,7 +21,7 @@ struct dm_io_client {
 
 /* FIXME: can we shrink this ? */
 struct io {
-	unsigned long error;
+	unsigned long error_bits;
 	atomic_t count;
 	struct task_struct *sleeper;
 	struct dm_io_client *client;
@@ -107,14 +108,14 @@ static inline unsigned bio_get_region(struct bio *bio)
 static void dec_count(struct io *io, unsigned int region, int error)
 {
 	if (error)
-		set_bit(region, &io->error);
+		set_bit(region, &io->error_bits);
 
 	if (atomic_dec_and_test(&io->count)) {
 		if (io->sleeper)
 			wake_up_process(io->sleeper);
 
 		else {
-			unsigned long r = io->error;
+			unsigned long r = io->error_bits;
 			io_notify_fn fn = io->callback;
 			void *context = io->context;
 
@@ -271,7 +272,7 @@ static void km_dp_init(struct dpages *dp, void *data)
 /*-----------------------------------------------------------------
  * IO routines that accept a list of pages.
  *---------------------------------------------------------------*/
-static void do_region(int rw, unsigned int region, struct io_region *where,
+static void do_region(int rw, unsigned region, struct dm_io_region *where,
 		      struct dpages *dp, struct io *io)
 {
 	struct bio *bio;
@@ -320,7 +321,7 @@ static void do_region(int rw, unsigned int region, struct io_region *where,
 }
 
 static void dispatch_io(int rw, unsigned int num_regions,
-			struct io_region *where, struct dpages *dp,
+			struct dm_io_region *where, struct dpages *dp,
 			struct io *io, int sync)
 {
 	int i;
@@ -347,17 +348,17 @@ static void dispatch_io(int rw, unsigned int num_regions,
 }
 
 static int sync_io(struct dm_io_client *client, unsigned int num_regions,
-		   struct io_region *where, int rw, struct dpages *dp,
+		   struct dm_io_region *where, int rw, struct dpages *dp,
 		   unsigned long *error_bits)
 {
 	struct io io;
 
-	if (num_regions > 1 && rw != WRITE) {
+	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
 		WARN_ON(1);
 		return -EIO;
 	}
 
-	io.error = 0;
+	io.error_bits = 0;
 	atomic_set(&io.count, 1); /* see dispatch_io() */
 	io.sleeper = current;
 	io.client = client;
@@ -378,25 +379,25 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		return -EINTR;
 
 	if (error_bits)
-		*error_bits = io.error;
+		*error_bits = io.error_bits;
 
-	return io.error ? -EIO : 0;
+	return io.error_bits ? -EIO : 0;
 }
 
 static int async_io(struct dm_io_client *client, unsigned int num_regions,
-		    struct io_region *where, int rw, struct dpages *dp,
+		    struct dm_io_region *where, int rw, struct dpages *dp,
 		    io_notify_fn fn, void *context)
 {
 	struct io *io;
 
-	if (num_regions > 1 && rw != WRITE) {
+	if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
 		WARN_ON(1);
 		fn(1, context);
 		return -EIO;
 	}
 
 	io = mempool_alloc(client->pool, GFP_NOIO);
-	io->error = 0;
+	io->error_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->sleeper = NULL;
 	io->client = client;
@@ -435,10 +436,15 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
 }
 
 /*
- * New collapsed (a)synchronous interface
+ * New collapsed (a)synchronous interface.
+ *
+ * If the IO is asynchronous (i.e. it has notify.fn), you must either unplug
+ * the queue with blk_unplug() some time later or set the BIO_RW_SYNC bit in
+ * io_req->bi_rw. If you fail to do one of these, the IO will be submitted to
+ * the disk after q->unplug_delay, which defaults to 3ms in blk-settings.c.
  */
 int dm_io(struct dm_io_request *io_req, unsigned num_regions,
-	  struct io_region *where, unsigned long *sync_error_bits)
+	  struct dm_io_region *where, unsigned long *sync_error_bits)
 {
 	int r;
 	struct dpages dp;
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h
deleted file mode 100644
index f647e2cceaa6..000000000000
--- a/drivers/md/dm-io.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (C) 2003 Sistina Software
- *
- * This file is released under the GPL.
- */
-
-#ifndef _DM_IO_H
-#define _DM_IO_H
-
-#include "dm.h"
-
-struct io_region {
-	struct block_device *bdev;
-	sector_t sector;
-	sector_t count;		/* If this is zero the region is ignored. */
-};
-
-struct page_list {
-	struct page_list *next;
-	struct page *page;
-};
-
-typedef void (*io_notify_fn)(unsigned long error, void *context);
-
-enum dm_io_mem_type {
-	DM_IO_PAGE_LIST,/* Page list */
-	DM_IO_BVEC,	/* Bio vector */
-	DM_IO_VMA,	/* Virtual memory area */
-	DM_IO_KMEM,	/* Kernel memory */
-};
-
-struct dm_io_memory {
-	enum dm_io_mem_type type;
-
-	union {
-		struct page_list *pl;
-		struct bio_vec *bvec;
-		void *vma;
-		void *addr;
-	} ptr;
-
-	unsigned offset;
-};
-
-struct dm_io_notify {
-	io_notify_fn fn;	/* Callback for asynchronous requests */
-	void *context;		/* Passed to callback */
-};
-
-/*
- * IO request structure
- */
-struct dm_io_client;
-struct dm_io_request {
-	int bi_rw;			/* READ|WRITE - not READA */
-	struct dm_io_memory mem;	/* Memory to use for io */
-	struct dm_io_notify notify;	/* Synchronous if notify.fn is NULL */
-	struct dm_io_client *client;	/* Client memory handler */
-};
-
-/*
- * For async io calls, users can alternatively use the dm_io() function below
- * and dm_io_client_create() to create private mempools for the client.
- *
- * Create/destroy may block.
- */
-struct dm_io_client *dm_io_client_create(unsigned num_pages);
-int dm_io_client_resize(unsigned num_pages, struct dm_io_client *client);
-void dm_io_client_destroy(struct dm_io_client *client);
-
-/*
- * IO interface using private per-client pools.
- * Each bit in the optional 'sync_error_bits' bitset indicates whether an
- * error occurred doing io to the corresponding region.
- */
-int dm_io(struct dm_io_request *io_req, unsigned num_regions,
-	  struct io_region *region, unsigned long *sync_error_bits);
-
-#endif
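
For orientation, this is roughly how a caller drives the dm_io() interface once this patch moves the header to <linux/dm-io.h>: a minimal sketch, assuming an already-created client; the helper name, device and buffer are illustrative and not part of the commit.

#include <linux/dm-io.h>

/* Hypothetical helper: synchronously read 4KB from the start of bdev
 * into a kmalloc'd buffer, using a previously created dm_io client. */
static int example_read_chunk(struct dm_io_client *client,
			      struct block_device *bdev, void *buf)
{
	unsigned long error_bits;
	struct dm_io_region where = {
		.bdev = bdev,
		.sector = 0,
		.count = 8,		/* 8 x 512-byte sectors = 4KB */
	};
	struct dm_io_request io_req = {
		.bi_rw = READ,
		.mem.type = DM_IO_KMEM,	/* plain kernel memory */
		.mem.ptr.addr = buf,
		.notify.fn = NULL,	/* NULL notify.fn => synchronous */
		.client = client,
	};

	/* one region; per-region error bits returned in error_bits */
	return dm_io(&io_req, 1, &where, &error_bits);
}
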
diff --git a/drivers/md/kcopyd.c b/drivers/md/dm-kcopyd.c
index e76b52ade690..996802b8a452 100644
--- a/drivers/md/kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -9,9 +9,8 @@
  * completion notification.
  */
 
-#include <asm/types.h>
+#include <linux/types.h>
 #include <asm/atomic.h>
-
 #include <linux/blkdev.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -23,24 +22,15 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
+#include <linux/dm-kcopyd.h>
 
-#include "kcopyd.h"
-
-static struct workqueue_struct *_kcopyd_wq;
-static struct work_struct _kcopyd_work;
-
-static void wake(void)
-{
-	queue_work(_kcopyd_wq, &_kcopyd_work);
-}
+#include "dm.h"
 
 /*-----------------------------------------------------------------
  * Each kcopyd client has its own little pool of preallocated
  * pages for kcopyd io.
  *---------------------------------------------------------------*/
-struct kcopyd_client {
-	struct list_head list;
-
+struct dm_kcopyd_client {
 	spinlock_t lock;
 	struct page_list *pages;
 	unsigned int nr_pages;
@@ -50,8 +40,32 @@ struct kcopyd_client {
 
 	wait_queue_head_t destroyq;
 	atomic_t nr_jobs;
+
+	mempool_t *job_pool;
+
+	struct workqueue_struct *kcopyd_wq;
+	struct work_struct kcopyd_work;
+
+/*
+ * We maintain three lists of jobs:
+ *
+ * i) jobs waiting for pages
+ * ii) jobs that have pages, and are waiting for the io to be issued.
+ * iii) jobs that have completed.
+ *
+ * All three of these are protected by job_lock.
+ */
+	spinlock_t job_lock;
+	struct list_head complete_jobs;
+	struct list_head io_jobs;
+	struct list_head pages_jobs;
 };
 
+static void wake(struct dm_kcopyd_client *kc)
+{
+	queue_work(kc->kcopyd_wq, &kc->kcopyd_work);
+}
+
 static struct page_list *alloc_pl(void)
 {
 	struct page_list *pl;
@@ -75,7 +89,7 @@ static void free_pl(struct page_list *pl)
 	kfree(pl);
 }
 
-static int kcopyd_get_pages(struct kcopyd_client *kc,
+static int kcopyd_get_pages(struct dm_kcopyd_client *kc,
 			    unsigned int nr, struct page_list **pages)
 {
 	struct page_list *pl;
@@ -98,7 +112,7 @@ static int kcopyd_get_pages(struct kcopyd_client *kc,
 	return 0;
 }
 
-static void kcopyd_put_pages(struct kcopyd_client *kc, struct page_list *pl)
+static void kcopyd_put_pages(struct dm_kcopyd_client *kc, struct page_list *pl)
 {
 	struct page_list *cursor;
 
@@ -126,7 +140,7 @@ static void drop_pages(struct page_list *pl)
 	}
 }
 
-static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
+static int client_alloc_pages(struct dm_kcopyd_client *kc, unsigned int nr)
 {
 	unsigned int i;
 	struct page_list *pl = NULL, *next;
@@ -147,7 +161,7 @@ static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
 	return 0;
 }
 
-static void client_free_pages(struct kcopyd_client *kc)
+static void client_free_pages(struct dm_kcopyd_client *kc)
 {
 	BUG_ON(kc->nr_free_pages != kc->nr_pages);
 	drop_pages(kc->pages);
@@ -161,7 +175,7 @@ static void client_free_pages(struct kcopyd_client *kc)
  * ever having to do io (which could cause a deadlock).
  *---------------------------------------------------------------*/
 struct kcopyd_job {
-	struct kcopyd_client *kc;
+	struct dm_kcopyd_client *kc;
 	struct list_head list;
 	unsigned long flags;
 
@@ -175,13 +189,13 @@ struct kcopyd_job {
 	 * Either READ or WRITE
 	 */
 	int rw;
-	struct io_region source;
+	struct dm_io_region source;
 
 	/*
 	 * The destinations for the transfer.
 	 */
 	unsigned int num_dests;
-	struct io_region dests[KCOPYD_MAX_REGIONS];
+	struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
 
 	sector_t offset;
 	unsigned int nr_pages;
@@ -191,7 +205,7 @@ struct kcopyd_job {
 	 * Set this to ensure you are notified when the job has
 	 * completed. 'context' is for callback to use.
 	 */
-	kcopyd_notify_fn fn;
+	dm_kcopyd_notify_fn fn;
 	void *context;
 
 	/*
@@ -207,47 +221,19 @@ struct kcopyd_job {
 #define MIN_JOBS 512
 
 static struct kmem_cache *_job_cache;
-static mempool_t *_job_pool;
 
-/*
- * We maintain three lists of jobs:
- *
- * i) jobs waiting for pages
- * ii) jobs that have pages, and are waiting for the io to be issued.
- * iii) jobs that have completed.
- *
- * All three of these are protected by job_lock.
- */
-static DEFINE_SPINLOCK(_job_lock);
-
-static LIST_HEAD(_complete_jobs);
-static LIST_HEAD(_io_jobs);
-static LIST_HEAD(_pages_jobs);
-
-static int jobs_init(void)
+int __init dm_kcopyd_init(void)
 {
 	_job_cache = KMEM_CACHE(kcopyd_job, 0);
 	if (!_job_cache)
 		return -ENOMEM;
 
-	_job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
-	if (!_job_pool) {
-		kmem_cache_destroy(_job_cache);
-		return -ENOMEM;
-	}
-
 	return 0;
 }
 
-static void jobs_exit(void)
+void dm_kcopyd_exit(void)
 {
-	BUG_ON(!list_empty(&_complete_jobs));
-	BUG_ON(!list_empty(&_io_jobs));
-	BUG_ON(!list_empty(&_pages_jobs));
-
-	mempool_destroy(_job_pool);
 	kmem_cache_destroy(_job_cache);
-	_job_pool = NULL;
 	_job_cache = NULL;
 }
@@ -255,18 +241,19 @@ static void jobs_exit(void)
  * Functions to push and pop a job onto the head of a given job
  * list.
  */
-static struct kcopyd_job *pop(struct list_head *jobs)
+static struct kcopyd_job *pop(struct list_head *jobs,
+			      struct dm_kcopyd_client *kc)
 {
 	struct kcopyd_job *job = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&_job_lock, flags);
+	spin_lock_irqsave(&kc->job_lock, flags);
 
 	if (!list_empty(jobs)) {
 		job = list_entry(jobs->next, struct kcopyd_job, list);
 		list_del(&job->list);
 	}
-	spin_unlock_irqrestore(&_job_lock, flags);
+	spin_unlock_irqrestore(&kc->job_lock, flags);
 
 	return job;
 }
@@ -274,10 +261,11 @@ static struct kcopyd_job *pop(struct list_head *jobs)
 static void push(struct list_head *jobs, struct kcopyd_job *job)
 {
 	unsigned long flags;
+	struct dm_kcopyd_client *kc = job->kc;
 
-	spin_lock_irqsave(&_job_lock, flags);
+	spin_lock_irqsave(&kc->job_lock, flags);
 	list_add_tail(&job->list, jobs);
-	spin_unlock_irqrestore(&_job_lock, flags);
+	spin_unlock_irqrestore(&kc->job_lock, flags);
 }
 
 /*
@@ -294,11 +282,11 @@ static int run_complete_job(struct kcopyd_job *job)
 	void *context = job->context;
 	int read_err = job->read_err;
 	unsigned long write_err = job->write_err;
-	kcopyd_notify_fn fn = job->fn;
-	struct kcopyd_client *kc = job->kc;
+	dm_kcopyd_notify_fn fn = job->fn;
+	struct dm_kcopyd_client *kc = job->kc;
 
 	kcopyd_put_pages(kc, job->pages);
-	mempool_free(job, _job_pool);
+	mempool_free(job, kc->job_pool);
 	fn(read_err, write_err, context);
 
 	if (atomic_dec_and_test(&kc->nr_jobs))
@@ -310,6 +298,7 @@ static int run_complete_job(struct kcopyd_job *job)
 static void complete_io(unsigned long error, void *context)
 {
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
+	struct dm_kcopyd_client *kc = job->kc;
 
 	if (error) {
 		if (job->rw == WRITE)
@@ -317,22 +306,22 @@ static void complete_io(unsigned long error, void *context)
 		else
 			job->read_err = 1;
 
-		if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
-			push(&_complete_jobs, job);
-			wake();
+		if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
+			push(&kc->complete_jobs, job);
+			wake(kc);
 			return;
 		}
 	}
 
 	if (job->rw == WRITE)
-		push(&_complete_jobs, job);
+		push(&kc->complete_jobs, job);
 
 	else {
 		job->rw = WRITE;
-		push(&_io_jobs, job);
+		push(&kc->io_jobs, job);
 	}
 
-	wake();
+	wake(kc);
 }
 
 /*
@@ -343,7 +332,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_rw = job->rw,
+		.bi_rw = job->rw | (1 << BIO_RW_SYNC),
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = job->offset,
@@ -369,7 +358,7 @@ static int run_pages_job(struct kcopyd_job *job)
 	r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
 	if (!r) {
 		/* this job is ready for io */
-		push(&_io_jobs, job);
+		push(&job->kc->io_jobs, job);
 		return 0;
 	}
 
@@ -384,12 +373,13 @@ static int run_pages_job(struct kcopyd_job *job)
  * Run through a list for as long as possible. Returns the count
  * of successful jobs.
  */
-static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
+static int process_jobs(struct list_head *jobs, struct dm_kcopyd_client *kc,
+			int (*fn) (struct kcopyd_job *))
 {
 	struct kcopyd_job *job;
 	int r, count = 0;
 
-	while ((job = pop(jobs))) {
+	while ((job = pop(jobs, kc))) {
 
 		r = fn(job);
 
@@ -399,7 +389,7 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
 				job->write_err = (unsigned long) -1L;
 			else
 				job->read_err = 1;
-			push(&_complete_jobs, job);
+			push(&kc->complete_jobs, job);
 			break;
 		}
 
@@ -421,8 +411,11 @@ static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
 /*
  * kcopyd does this every time it's woken up.
  */
-static void do_work(struct work_struct *ignored)
+static void do_work(struct work_struct *work)
 {
+	struct dm_kcopyd_client *kc = container_of(work,
+					struct dm_kcopyd_client, kcopyd_work);
+
 	/*
 	 * The order that these are called is *very* important.
 	 * complete jobs can free some pages for pages jobs.
@@ -430,9 +423,9 @@ static void do_work(struct work_struct *ignored)
 	 * list. io jobs call wake when they complete and it all
 	 * starts again.
 	 */
-	process_jobs(&_complete_jobs, run_complete_job);
-	process_jobs(&_pages_jobs, run_pages_job);
-	process_jobs(&_io_jobs, run_io_job);
+	process_jobs(&kc->complete_jobs, kc, run_complete_job);
+	process_jobs(&kc->pages_jobs, kc, run_pages_job);
+	process_jobs(&kc->io_jobs, kc, run_io_job);
 }
 
 /*
@@ -442,9 +435,10 @@ static void do_work(struct work_struct *ignored)
  */
 static void dispatch_job(struct kcopyd_job *job)
 {
-	atomic_inc(&job->kc->nr_jobs);
-	push(&_pages_jobs, job);
-	wake();
+	struct dm_kcopyd_client *kc = job->kc;
+	atomic_inc(&kc->nr_jobs);
+	push(&kc->pages_jobs, job);
+	wake(kc);
 }
 
 #define SUB_JOB_SIZE 128
@@ -469,7 +463,7 @@ static void segment_complete(int read_err, unsigned long write_err,
 	 * Only dispatch more work if there hasn't been an error.
 	 */
 	if ((!job->read_err && !job->write_err) ||
-	    test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
+	    test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
 		/* get the next chunk of work */
 		progress = job->progress;
 		count = job->source.count - progress;
@@ -484,7 +478,8 @@ static void segment_complete(int read_err, unsigned long write_err,
 
 	if (count) {
 		int i;
-		struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
+		struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool,
+							   GFP_NOIO);
 
 		*sub_job = *job;
 		sub_job->source.sector += progress;
@@ -508,7 +503,7 @@ static void segment_complete(int read_err, unsigned long write_err,
 		 * after we've completed.
 		 */
 		job->fn(read_err, write_err, job->context);
-		mempool_free(job, _job_pool);
+		mempool_free(job, job->kc->job_pool);
 	}
 }
 
@@ -526,16 +521,16 @@ static void split_job(struct kcopyd_job *job)
 	segment_complete(0, 0u, job);
 }
 
-int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
-		unsigned int num_dests, struct io_region *dests,
-		unsigned int flags, kcopyd_notify_fn fn, void *context)
+int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
+		   unsigned int num_dests, struct dm_io_region *dests,
+		   unsigned int flags, dm_kcopyd_notify_fn fn, void *context)
 {
 	struct kcopyd_job *job;
 
 	/*
 	 * Allocate a new job.
 	 */
-	job = mempool_alloc(_job_pool, GFP_NOIO);
+	job = mempool_alloc(kc->job_pool, GFP_NOIO);
 
 	/*
 	 * set up for the read.
@@ -569,6 +564,7 @@ int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
 
 	return 0;
 }
+EXPORT_SYMBOL(dm_kcopyd_copy);
 
 /*
  * Cancels a kcopyd job, eg. someone might be deactivating a
@@ -583,126 +579,76 @@ int kcopyd_cancel(struct kcopyd_job *job, int block)
 #endif /*  0  */
 
 /*-----------------------------------------------------------------
- * Unit setup
+ * Client setup
  *---------------------------------------------------------------*/
-static DEFINE_MUTEX(_client_lock);
-static LIST_HEAD(_clients);
-
-static void client_add(struct kcopyd_client *kc)
+int dm_kcopyd_client_create(unsigned int nr_pages,
+			    struct dm_kcopyd_client **result)
 {
-	mutex_lock(&_client_lock);
-	list_add(&kc->list, &_clients);
-	mutex_unlock(&_client_lock);
-}
-
-static void client_del(struct kcopyd_client *kc)
-{
-	mutex_lock(&_client_lock);
-	list_del(&kc->list);
-	mutex_unlock(&_client_lock);
-}
-
-static DEFINE_MUTEX(kcopyd_init_lock);
-static int kcopyd_clients = 0;
+	int r = -ENOMEM;
+	struct dm_kcopyd_client *kc;
 
-static int kcopyd_init(void)
-{
-	int r;
-
-	mutex_lock(&kcopyd_init_lock);
-
-	if (kcopyd_clients) {
-		/* Already initialized. */
-		kcopyd_clients++;
-		mutex_unlock(&kcopyd_init_lock);
-		return 0;
-	}
-
-	r = jobs_init();
-	if (r) {
-		mutex_unlock(&kcopyd_init_lock);
-		return r;
-	}
-
-	_kcopyd_wq = create_singlethread_workqueue("kcopyd");
-	if (!_kcopyd_wq) {
-		jobs_exit();
-		mutex_unlock(&kcopyd_init_lock);
+	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+	if (!kc)
 		return -ENOMEM;
-	}
-
-	kcopyd_clients++;
-	INIT_WORK(&_kcopyd_work, do_work);
-	mutex_unlock(&kcopyd_init_lock);
-	return 0;
-}
 
-static void kcopyd_exit(void)
-{
-	mutex_lock(&kcopyd_init_lock);
-	kcopyd_clients--;
-	if (!kcopyd_clients) {
-		jobs_exit();
-		destroy_workqueue(_kcopyd_wq);
-		_kcopyd_wq = NULL;
-	}
-	mutex_unlock(&kcopyd_init_lock);
-}
-
-int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
-{
-	int r = 0;
-	struct kcopyd_client *kc;
+	spin_lock_init(&kc->lock);
+	spin_lock_init(&kc->job_lock);
+	INIT_LIST_HEAD(&kc->complete_jobs);
+	INIT_LIST_HEAD(&kc->io_jobs);
+	INIT_LIST_HEAD(&kc->pages_jobs);
 
-	r = kcopyd_init();
-	if (r)
-		return r;
+	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
+	if (!kc->job_pool)
+		goto bad_slab;
 
-	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
-	if (!kc) {
-		kcopyd_exit();
-		return -ENOMEM;
-	}
+	INIT_WORK(&kc->kcopyd_work, do_work);
+	kc->kcopyd_wq = create_singlethread_workqueue("kcopyd");
+	if (!kc->kcopyd_wq)
+		goto bad_workqueue;
 
-	spin_lock_init(&kc->lock);
 	kc->pages = NULL;
 	kc->nr_pages = kc->nr_free_pages = 0;
 	r = client_alloc_pages(kc, nr_pages);
-	if (r) {
-		kfree(kc);
-		kcopyd_exit();
-		return r;
-	}
+	if (r)
+		goto bad_client_pages;
 
 	kc->io_client = dm_io_client_create(nr_pages);
 	if (IS_ERR(kc->io_client)) {
 		r = PTR_ERR(kc->io_client);
-		client_free_pages(kc);
-		kfree(kc);
-		kcopyd_exit();
-		return r;
+		goto bad_io_client;
 	}
 
 	init_waitqueue_head(&kc->destroyq);
 	atomic_set(&kc->nr_jobs, 0);
 
-	client_add(kc);
 	*result = kc;
 	return 0;
+
+bad_io_client:
+	client_free_pages(kc);
+bad_client_pages:
+	destroy_workqueue(kc->kcopyd_wq);
+bad_workqueue:
+	mempool_destroy(kc->job_pool);
+bad_slab:
+	kfree(kc);
+
+	return r;
 }
+EXPORT_SYMBOL(dm_kcopyd_client_create);
 
-void kcopyd_client_destroy(struct kcopyd_client *kc)
+void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
 {
 	/* Wait for completion of all jobs submitted by this client. */
 	wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
 
+	BUG_ON(!list_empty(&kc->complete_jobs));
+	BUG_ON(!list_empty(&kc->io_jobs));
+	BUG_ON(!list_empty(&kc->pages_jobs));
+	destroy_workqueue(kc->kcopyd_wq);
 	dm_io_client_destroy(kc->io_client);
 	client_free_pages(kc);
-	client_del(kc);
+	mempool_destroy(kc->job_pool);
 	kfree(kc);
-	kcopyd_exit();
 }
-
-EXPORT_SYMBOL(kcopyd_client_create);
-EXPORT_SYMBOL(kcopyd_client_destroy);
-EXPORT_SYMBOL(kcopyd_copy);
+EXPORT_SYMBOL(dm_kcopyd_client_destroy);
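
To illustrate the renamed per-client interface above, here is a hedged sketch of a caller copying one region; the devices, sector counts and callback are illustrative only, not taken from this commit.

#include <linux/dm-kcopyd.h>

/* Hypothetical completion callback, run from kcopyd's workqueue. */
static void example_copy_done(int read_err, unsigned long write_err,
			      void *context)
{
	/* read_err/write_err are nonzero on failure */
}

static int example_copy(struct block_device *src, struct block_device *dst)
{
	struct dm_kcopyd_client *kc;
	struct dm_io_region from = { .bdev = src, .sector = 0, .count = 128 };
	struct dm_io_region to   = { .bdev = dst, .sector = 0, .count = 128 };
	int r;

	r = dm_kcopyd_client_create(32, &kc);	/* 32 preallocated pages */
	if (r)
		return r;

	/* one destination; flags = 0 (could set DM_KCOPYD_IGNORE_ERROR) */
	return dm_kcopyd_copy(kc, &from, 1, &to, 0, example_copy_done, NULL);
}

Note the design shift visible in the diff: the job lists, spinlock, mempool and workqueue that used to be module-wide globals now live inside each dm_kcopyd_client, so clients no longer share state or reference-count a global initialisation.
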
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 2a74b2142f50..67a6f31b7fc3 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2003 Sistina Software
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the LGPL.
  */
@@ -8,64 +9,58 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
+#include <linux/dm-io.h>
+#include <linux/dm-dirty-log.h>
 
-#include "dm-log.h"
-#include "dm-io.h"
+#include "dm.h"
 
-#define DM_MSG_PREFIX "mirror log"
+#define DM_MSG_PREFIX "dirty region log"
 
-static LIST_HEAD(_log_types);
-static DEFINE_SPINLOCK(_lock);
+struct dm_dirty_log_internal {
+	struct dm_dirty_log_type *type;
 
-int dm_register_dirty_log_type(struct dirty_log_type *type)
-{
-	spin_lock(&_lock);
-	type->use_count = 0;
-	list_add(&type->list, &_log_types);
-	spin_unlock(&_lock);
+	struct list_head list;
+	long use;
+};
 
-	return 0;
-}
+static LIST_HEAD(_log_types);
+static DEFINE_SPINLOCK(_lock);
 
-int dm_unregister_dirty_log_type(struct dirty_log_type *type)
+static struct dm_dirty_log_internal *__find_dirty_log_type(const char *name)
 {
-	spin_lock(&_lock);
-
-	if (type->use_count)
-		DMWARN("Attempt to unregister a log type that is still in use");
-	else
-		list_del(&type->list);
+	struct dm_dirty_log_internal *log_type;
 
-	spin_unlock(&_lock);
+	list_for_each_entry(log_type, &_log_types, list)
+		if (!strcmp(name, log_type->type->name))
+			return log_type;
 
-	return 0;
+	return NULL;
 }
 
-static struct dirty_log_type *_get_type(const char *type_name)
+static struct dm_dirty_log_internal *_get_dirty_log_type(const char *name)
 {
-	struct dirty_log_type *type;
+	struct dm_dirty_log_internal *log_type;
 
 	spin_lock(&_lock);
-	list_for_each_entry (type, &_log_types, list)
-		if (!strcmp(type_name, type->name)) {
-			if (!type->use_count && !try_module_get(type->module)){
-				spin_unlock(&_lock);
-				return NULL;
-			}
-			type->use_count++;
-			spin_unlock(&_lock);
-			return type;
-		}
+
+	log_type = __find_dirty_log_type(name);
+	if (log_type) {
+		if (!log_type->use && !try_module_get(log_type->type->module))
+			log_type = NULL;
+		else
+			log_type->use++;
+	}
 
 	spin_unlock(&_lock);
-	return NULL;
+
+	return log_type;
 }
 
 /*
  * get_type
  * @type_name
  *
- * Attempt to retrieve the dirty_log_type by name. If not already
+ * Attempt to retrieve the dm_dirty_log_type by name. If not already
  * available, attempt to load the appropriate module.
  *
  * Log modules are named "dm-log-" followed by the 'type_name'.
@@ -78,14 +73,17 @@ static struct dirty_log_type *_get_type(const char *type_name)
  *
  * Returns: dirty_log_type* on success, NULL on failure
  */
-static struct dirty_log_type *get_type(const char *type_name)
+static struct dm_dirty_log_type *get_type(const char *type_name)
 {
 	char *p, *type_name_dup;
-	struct dirty_log_type *type;
+	struct dm_dirty_log_internal *log_type;
+
+	if (!type_name)
+		return NULL;
 
-	type = _get_type(type_name);
-	if (type)
-		return type;
+	log_type = _get_dirty_log_type(type_name);
+	if (log_type)
+		return log_type->type;
 
 	type_name_dup = kstrdup(type_name, GFP_KERNEL);
 	if (!type_name_dup) {
@@ -95,34 +93,106 @@ static struct dirty_log_type *get_type(const char *type_name)
 	}
 
 	while (request_module("dm-log-%s", type_name_dup) ||
-	       !(type = _get_type(type_name))) {
+	       !(log_type = _get_dirty_log_type(type_name))) {
 		p = strrchr(type_name_dup, '-');
 		if (!p)
 			break;
 		p[0] = '\0';
 	}
 
-	if (!type)
+	if (!log_type)
 		DMWARN("Module for logging type \"%s\" not found.", type_name);
 
 	kfree(type_name_dup);
 
-	return type;
+	return log_type ? log_type->type : NULL;
 }
 
-static void put_type(struct dirty_log_type *type)
+static void put_type(struct dm_dirty_log_type *type)
 {
+	struct dm_dirty_log_internal *log_type;
+
+	if (!type)
+		return;
+
 	spin_lock(&_lock);
-	if (!--type->use_count)
+	log_type = __find_dirty_log_type(type->name);
+	if (!log_type)
+		goto out;
+
+	if (!--log_type->use)
 		module_put(type->module);
+
+	BUG_ON(log_type->use < 0);
+
+out:
 	spin_unlock(&_lock);
 }
 
-struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
-				      unsigned int argc, char **argv)
+static struct dm_dirty_log_internal *_alloc_dirty_log_type(struct dm_dirty_log_type *type)
 {
-	struct dirty_log_type *type;
-	struct dirty_log *log;
+	struct dm_dirty_log_internal *log_type = kzalloc(sizeof(*log_type),
+							 GFP_KERNEL);
+
+	if (log_type)
+		log_type->type = type;
+
+	return log_type;
+}
+
+int dm_dirty_log_type_register(struct dm_dirty_log_type *type)
+{
+	struct dm_dirty_log_internal *log_type = _alloc_dirty_log_type(type);
+	int r = 0;
+
+	if (!log_type)
+		return -ENOMEM;
+
+	spin_lock(&_lock);
+	if (!__find_dirty_log_type(type->name))
+		list_add(&log_type->list, &_log_types);
+	else {
+		kfree(log_type);
+		r = -EEXIST;
+	}
+	spin_unlock(&_lock);
+
+	return r;
+}
+EXPORT_SYMBOL(dm_dirty_log_type_register);
+
+int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
+{
+	struct dm_dirty_log_internal *log_type;
+
+	spin_lock(&_lock);
+
+	log_type = __find_dirty_log_type(type->name);
+	if (!log_type) {
+		spin_unlock(&_lock);
+		return -EINVAL;
+	}
+
+	if (log_type->use) {
+		spin_unlock(&_lock);
+		return -ETXTBSY;
+	}
+
+	list_del(&log_type->list);
+
+	spin_unlock(&_lock);
+	kfree(log_type);
+
+	return 0;
+}
+EXPORT_SYMBOL(dm_dirty_log_type_unregister);
+
+struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
+					 struct dm_target *ti,
+					 unsigned int argc, char **argv)
+{
+	struct dm_dirty_log_type *type;
+	struct dm_dirty_log *log;
 
 	log = kmalloc(sizeof(*log), GFP_KERNEL);
 	if (!log)
143 213
144 return log; 214 return log;
145} 215}
216EXPORT_SYMBOL(dm_dirty_log_create);
146 217
147void dm_destroy_dirty_log(struct dirty_log *log) 218void dm_dirty_log_destroy(struct dm_dirty_log *log)
148{ 219{
149 log->type->dtr(log); 220 log->type->dtr(log);
150 put_type(log->type); 221 put_type(log->type);
151 kfree(log); 222 kfree(log);
152} 223}
224EXPORT_SYMBOL(dm_dirty_log_destroy);
153 225
154/*----------------------------------------------------------------- 226/*-----------------------------------------------------------------
155 * Persistent and core logs share a lot of their implementation. 227 * Persistent and core logs share a lot of their implementation.
@@ -207,7 +279,7 @@ struct log_c {
207 struct dm_dev *log_dev; 279 struct dm_dev *log_dev;
208 struct log_header header; 280 struct log_header header;
209 281
210 struct io_region header_location; 282 struct dm_io_region header_location;
211 struct log_header *disk_header; 283 struct log_header *disk_header;
212}; 284};
213 285
@@ -215,7 +287,7 @@ struct log_c {
215 * The touched member needs to be updated every time we access 287 * The touched member needs to be updated every time we access
216 * one of the bitsets. 288 * one of the bitsets.
217 */ 289 */
218static inline int log_test_bit(uint32_t *bs, unsigned bit) 290static inline int log_test_bit(uint32_t *bs, unsigned bit)
219{ 291{
220 return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0; 292 return ext2_test_bit(bit, (unsigned long *) bs) ? 1 : 0;
221} 293}
@@ -302,7 +374,7 @@ static inline int write_header(struct log_c *log)
  * argv contains region_size followed optionally by [no]sync
  *--------------------------------------------------------------*/
 #define BYTE_SHIFT 3
-static int create_log_context(struct dirty_log *log, struct dm_target *ti,
+static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 			      unsigned int argc, char **argv,
 			      struct dm_dev *dev)
 {
@@ -315,7 +387,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
 	int r;
 
 	if (argc < 1 || argc > 2) {
-		DMWARN("wrong number of arguments to mirror log");
+		DMWARN("wrong number of arguments to dirty region log");
 		return -EINVAL;
 	}
 
@@ -325,8 +397,8 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
 		else if (!strcmp(argv[1], "nosync"))
 			sync = NOSYNC;
 		else {
-			DMWARN("unrecognised sync argument to mirror log: %s",
-			       argv[1]);
+			DMWARN("unrecognised sync argument to "
+			       "dirty region log: %s", argv[1]);
 			return -EINVAL;
 		}
 	}
@@ -434,7 +506,7 @@ static int create_log_context(struct dirty_log *log, struct dm_target *ti,
 	return 0;
 }
 
-static int core_ctr(struct dirty_log *log, struct dm_target *ti,
+static int core_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		    unsigned int argc, char **argv)
 {
 	return create_log_context(log, ti, argc, argv, NULL);
@@ -447,7 +519,7 @@ static void destroy_log_context(struct log_c *lc)
 	kfree(lc);
 }
 
-static void core_dtr(struct dirty_log *log)
+static void core_dtr(struct dm_dirty_log *log)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -460,14 +532,14 @@ static void core_dtr(struct dirty_log *log)
  *
  * argv contains log_device region_size followed optionally by [no]sync
  *--------------------------------------------------------------*/
-static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
+static int disk_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		    unsigned int argc, char **argv)
 {
 	int r;
 	struct dm_dev *dev;
 
 	if (argc < 2 || argc > 3) {
-		DMWARN("wrong number of arguments to disk mirror log");
+		DMWARN("wrong number of arguments to disk dirty region log");
 		return -EINVAL;
 	}
 
@@ -485,7 +557,7 @@ static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
 	return 0;
 }
 
-static void disk_dtr(struct dirty_log *log)
+static void disk_dtr(struct dm_dirty_log *log)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -514,7 +586,7 @@ static void fail_log_device(struct log_c *lc)
 	dm_table_event(lc->ti->table);
 }
 
-static int disk_resume(struct dirty_log *log)
+static int disk_resume(struct dm_dirty_log *log)
 {
 	int r;
 	unsigned i;
@@ -524,7 +596,7 @@ static int disk_resume(struct dirty_log *log)
 	/* read the disk header */
 	r = read_header(lc);
 	if (r) {
-		DMWARN("%s: Failed to read header on mirror log device",
+		DMWARN("%s: Failed to read header on dirty region log device",
 		       lc->log_dev->name);
 		fail_log_device(lc);
 		/*
@@ -562,7 +634,7 @@ static int disk_resume(struct dirty_log *log)
 	/* write the new header */
 	r = write_header(lc);
 	if (r) {
-		DMWARN("%s: Failed to write header on mirror log device",
+		DMWARN("%s: Failed to write header on dirty region log device",
 		       lc->log_dev->name);
 		fail_log_device(lc);
 	}
@@ -570,38 +642,38 @@ static int disk_resume(struct dirty_log *log)
 	return r;
 }
 
-static uint32_t core_get_region_size(struct dirty_log *log)
+static uint32_t core_get_region_size(struct dm_dirty_log *log)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	return lc->region_size;
 }
 
-static int core_resume(struct dirty_log *log)
+static int core_resume(struct dm_dirty_log *log)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	lc->sync_search = 0;
 	return 0;
 }
 
-static int core_is_clean(struct dirty_log *log, region_t region)
+static int core_is_clean(struct dm_dirty_log *log, region_t region)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	return log_test_bit(lc->clean_bits, region);
 }
 
-static int core_in_sync(struct dirty_log *log, region_t region, int block)
+static int core_in_sync(struct dm_dirty_log *log, region_t region, int block)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	return log_test_bit(lc->sync_bits, region);
 }
 
-static int core_flush(struct dirty_log *log)
+static int core_flush(struct dm_dirty_log *log)
 {
 	/* no op */
 	return 0;
 }
 
-static int disk_flush(struct dirty_log *log)
+static int disk_flush(struct dm_dirty_log *log)
 {
 	int r;
 	struct log_c *lc = (struct log_c *) log->context;
@@ -619,19 +691,19 @@ static int disk_flush(struct dirty_log *log)
 	return r;
 }
 
-static void core_mark_region(struct dirty_log *log, region_t region)
+static void core_mark_region(struct dm_dirty_log *log, region_t region)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	log_clear_bit(lc, lc->clean_bits, region);
 }
 
-static void core_clear_region(struct dirty_log *log, region_t region)
+static void core_clear_region(struct dm_dirty_log *log, region_t region)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	log_set_bit(lc, lc->clean_bits, region);
 }
 
-static int core_get_resync_work(struct dirty_log *log, region_t *region)
+static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -654,7 +726,7 @@ static int core_get_resync_work(struct dirty_log *log, region_t *region)
 	return 1;
 }
 
-static void core_set_region_sync(struct dirty_log *log, region_t region,
+static void core_set_region_sync(struct dm_dirty_log *log, region_t region,
 				 int in_sync)
 {
 	struct log_c *lc = (struct log_c *) log->context;
@@ -669,7 +741,7 @@ static void core_set_region_sync(struct dirty_log *log, region_t region,
 	}
 }
 
-static region_t core_get_sync_count(struct dirty_log *log)
+static region_t core_get_sync_count(struct dm_dirty_log *log)
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -680,7 +752,7 @@ static region_t core_get_sync_count(struct dirty_log *log)
 	if (lc->sync != DEFAULTSYNC) \
 		DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "")
 
-static int core_status(struct dirty_log *log, status_type_t status,
+static int core_status(struct dm_dirty_log *log, status_type_t status,
 		       char *result, unsigned int maxlen)
 {
 	int sz = 0;
@@ -700,7 +772,7 @@ static int core_status(struct dirty_log *log, status_type_t status,
 	return sz;
 }
 
-static int disk_status(struct dirty_log *log, status_type_t status,
+static int disk_status(struct dm_dirty_log *log, status_type_t status,
 		       char *result, unsigned int maxlen)
 {
 	int sz = 0;
@@ -722,7 +794,7 @@ static int disk_status(struct dirty_log *log, status_type_t status,
 	return sz;
 }
 
-static struct dirty_log_type _core_type = {
+static struct dm_dirty_log_type _core_type = {
 	.name = "core",
 	.module = THIS_MODULE,
 	.ctr = core_ctr,
@@ -740,7 +812,7 @@ static struct dirty_log_type _core_type = {
 	.status = core_status,
 };
 
-static struct dirty_log_type _disk_type = {
+static struct dm_dirty_log_type _disk_type = {
 	.name = "disk",
 	.module = THIS_MODULE,
 	.ctr = disk_ctr,
@@ -763,26 +835,28 @@ int __init dm_dirty_log_init(void)
 {
 	int r;
 
-	r = dm_register_dirty_log_type(&_core_type);
+	r = dm_dirty_log_type_register(&_core_type);
 	if (r)
 		DMWARN("couldn't register core log");
 
-	r = dm_register_dirty_log_type(&_disk_type);
+	r = dm_dirty_log_type_register(&_disk_type);
 	if (r) {
 		DMWARN("couldn't register disk type");
-		dm_unregister_dirty_log_type(&_core_type);
+		dm_dirty_log_type_unregister(&_core_type);
 	}
 
 	return r;
 }
 
-void dm_dirty_log_exit(void)
+void __exit dm_dirty_log_exit(void)
 {
-	dm_unregister_dirty_log_type(&_disk_type);
-	dm_unregister_dirty_log_type(&_core_type);
+	dm_dirty_log_type_unregister(&_disk_type);
+	dm_dirty_log_type_unregister(&_core_type);
 }
 
-EXPORT_SYMBOL(dm_register_dirty_log_type);
-EXPORT_SYMBOL(dm_unregister_dirty_log_type);
-EXPORT_SYMBOL(dm_create_dirty_log);
-EXPORT_SYMBOL(dm_destroy_dirty_log);
+module_init(dm_dirty_log_init);
+module_exit(dm_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " dirty region log");
+MODULE_AUTHOR("Joe Thornber, Heinz Mauelshagen <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h
deleted file mode 100644
index 3fae87eb5963..000000000000
--- a/drivers/md/dm-log.h
+++ /dev/null
@@ -1,131 +0,0 @@
1/*
2 * Copyright (C) 2003 Sistina Software
3 *
4 * This file is released under the LGPL.
5 */
6
7#ifndef DM_DIRTY_LOG
8#define DM_DIRTY_LOG
9
10#include "dm.h"
11
12typedef sector_t region_t;
13
14struct dirty_log_type;
15
16struct dirty_log {
17 struct dirty_log_type *type;
18 void *context;
19};
20
21struct dirty_log_type {
22 struct list_head list;
23 const char *name;
24 struct module *module;
25 unsigned int use_count;
26
27 int (*ctr)(struct dirty_log *log, struct dm_target *ti,
28 unsigned int argc, char **argv);
29 void (*dtr)(struct dirty_log *log);
30
31 /*
32 * There are times when we don't want the log to touch
33 * the disk.
34 */
35 int (*presuspend)(struct dirty_log *log);
36 int (*postsuspend)(struct dirty_log *log);
37 int (*resume)(struct dirty_log *log);
38
39 /*
40 * Retrieves the smallest size of region that the log can
41 * deal with.
42 */
43 uint32_t (*get_region_size)(struct dirty_log *log);
44
45 /*
46 * A predicate to say whether a region is clean or not.
47 * May block.
48 */
49 int (*is_clean)(struct dirty_log *log, region_t region);
50
51 /*
52 * Returns: 0, 1, -EWOULDBLOCK, < 0
53 *
54 * A predicate function to check whether the area given by
55 * [sector, sector + len) is in sync.
56 *
57 * If -EWOULDBLOCK is returned, the state of the region is
58 * unknown; typically this will result in a read being
59 * passed to a daemon to deal with, since a daemon is
60 * allowed to block.
61 */
62 int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
63
64 /*
65 * Flush the current log state (eg, to disk). This
66 * function may block.
67 */
68 int (*flush)(struct dirty_log *log);
69
70 /*
71 * Mark an area as clean or dirty. These functions may
72 * block, though for performance reasons blocking should
73 * be extremely rare (eg, allocating another chunk of
74 * memory for some reason).
75 */
76 void (*mark_region)(struct dirty_log *log, region_t region);
77 void (*clear_region)(struct dirty_log *log, region_t region);
78
79 /*
80 * Returns: <0 (error), 0 (no region), 1 (region)
81 *
82 * The mirrord will need to perform recovery on regions of
83 * the mirror that are in the NOSYNC state. This
84 * function asks the log to tell the caller about the
85 * next region that this machine should recover.
86 *
87 * Do not confuse this function with 'in_sync()': one
88 * tells you whether an area is synchronised, the other
89 * assigns recovery work.
90 */
91 int (*get_resync_work)(struct dirty_log *log, region_t *region);
92
93 /*
94 * This notifies the log that the resync status of a region
95 * has changed. It also clears the region from the recovering
96 * list (if present).
97 */
98 void (*set_region_sync)(struct dirty_log *log,
99 region_t region, int in_sync);
100
101 /*
102 * Returns the number of regions that are in sync.
103 */
104 region_t (*get_sync_count)(struct dirty_log *log);
105
106 /*
107 * Support function for mirror status requests.
108 */
109 int (*status)(struct dirty_log *log, status_type_t status_type,
110 char *result, unsigned int maxlen);
111};
112
113int dm_register_dirty_log_type(struct dirty_log_type *type);
114int dm_unregister_dirty_log_type(struct dirty_log_type *type);
115
116
117/*
118 * Make sure you use these two functions, rather than calling
119 * type->constructor/destructor() directly.
120 */
121struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
122 unsigned int argc, char **argv);
123void dm_destroy_dirty_log(struct dirty_log *log);
124
125/*
126 * init/exit functions.
127 */
128int dm_dirty_log_init(void);
129void dm_dirty_log_exit(void);
130
131#endif
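The hooks in this deleted header survive under a dm_ prefix; the callers in this patch now pull them from <linux/dm-dirty-log.h>. A minimal registration skeleton against the renamed API might look as follows. The "null" type and its trivial callbacks are invented here purely for illustration, and a usable log type would also need to supply the state hooks listed above (is_clean, in_sync, mark_region and friends):

#include <linux/module.h>
#include <linux/dm-dirty-log.h>

/* trivial constructor: no per-log state to set up */
static int null_log_ctr(struct dm_dirty_log *log, struct dm_target *ti,
                        unsigned int argc, char **argv)
{
        log->context = NULL;
        return 0;
}

static void null_log_dtr(struct dm_dirty_log *log)
{
}

static struct dm_dirty_log_type _null_log_type = {
        .name   = "null",
        .module = THIS_MODULE,
        .ctr    = null_log_ctr,
        .dtr    = null_log_dtr,
};

static int __init null_log_init(void)
{
        /* note the object-verb naming this patch switches to */
        return dm_dirty_log_type_register(&_null_log_type);
}

static void __exit null_log_exit(void)
{
        dm_dirty_log_type_unregister(&_null_log_type);
}

module_init(null_log_init);
module_exit(null_log_exit);
MODULE_LICENSE("GPL");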
diff --git a/drivers/md/dm-mpath-hp-sw.c b/drivers/md/dm-mpath-hp-sw.c
index 204bf42c9449..b63a0ab37c53 100644
--- a/drivers/md/dm-mpath-hp-sw.c
+++ b/drivers/md/dm-mpath-hp-sw.c
@@ -137,7 +137,6 @@ static struct request *hp_sw_get_request(struct dm_path *path)
137 req->sense = h->sense; 137 req->sense = h->sense;
138 memset(req->sense, 0, SCSI_SENSE_BUFFERSIZE); 138 memset(req->sense, 0, SCSI_SENSE_BUFFERSIZE);
139 139
140 memset(&req->cmd, 0, BLK_MAX_CDB);
141 req->cmd[0] = START_STOP; 140 req->cmd[0] = START_STOP;
142 req->cmd[4] = 1; 141 req->cmd[4] = 1;
143 req->cmd_len = COMMAND_SIZE(req->cmd[0]); 142 req->cmd_len = COMMAND_SIZE(req->cmd[0]);
diff --git a/drivers/md/dm-mpath-rdac.c b/drivers/md/dm-mpath-rdac.c
index e04eb5c697fb..95e77734880a 100644
--- a/drivers/md/dm-mpath-rdac.c
+++ b/drivers/md/dm-mpath-rdac.c
@@ -284,7 +284,6 @@ static struct request *get_rdac_req(struct rdac_handler *h,
284 return NULL; 284 return NULL;
285 } 285 }
286 286
287 memset(&rq->cmd, 0, BLK_MAX_CDB);
288 rq->sense = h->sense; 287 rq->sense = h->sense;
289 memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE); 288 memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
290 rq->sense_len = 0; 289 rq->sense_len = 0;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 762cb086bb7f..ff05fe893083 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -7,9 +7,6 @@
7#include "dm.h" 7#include "dm.h"
8#include "dm-bio-list.h" 8#include "dm-bio-list.h"
9#include "dm-bio-record.h" 9#include "dm-bio-record.h"
10#include "dm-io.h"
11#include "dm-log.h"
12#include "kcopyd.h"
13 10
14#include <linux/ctype.h> 11#include <linux/ctype.h>
15#include <linux/init.h> 12#include <linux/init.h>
@@ -22,6 +19,9 @@
22#include <linux/workqueue.h> 19#include <linux/workqueue.h>
23#include <linux/log2.h> 20#include <linux/log2.h>
24#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/dm-io.h>
23#include <linux/dm-dirty-log.h>
24#include <linux/dm-kcopyd.h>
25 25
26#define DM_MSG_PREFIX "raid1" 26#define DM_MSG_PREFIX "raid1"
27#define DM_IO_PAGES 64 27#define DM_IO_PAGES 64
@@ -74,7 +74,7 @@ struct region_hash {
74 unsigned region_shift; 74 unsigned region_shift;
75 75
76 /* holds persistent region state */ 76 /* holds persistent region state */
77 struct dirty_log *log; 77 struct dm_dirty_log *log;
78 78
79 /* hash table */ 79 /* hash table */
80 rwlock_t hash_lock; 80 rwlock_t hash_lock;
@@ -133,7 +133,7 @@ struct mirror_set {
133 struct dm_target *ti; 133 struct dm_target *ti;
134 struct list_head list; 134 struct list_head list;
135 struct region_hash rh; 135 struct region_hash rh;
136 struct kcopyd_client *kcopyd_client; 136 struct dm_kcopyd_client *kcopyd_client;
137 uint64_t features; 137 uint64_t features;
138 138
139 spinlock_t lock; /* protects the lists */ 139 spinlock_t lock; /* protects the lists */
@@ -154,6 +154,9 @@ struct mirror_set {
154 154
155 struct workqueue_struct *kmirrord_wq; 155 struct workqueue_struct *kmirrord_wq;
156 struct work_struct kmirrord_work; 156 struct work_struct kmirrord_work;
157 struct timer_list timer;
158 unsigned long timer_pending;
159
157 struct work_struct trigger_event; 160 struct work_struct trigger_event;
158 161
159 unsigned int nr_mirrors; 162 unsigned int nr_mirrors;
@@ -178,13 +181,32 @@ static void wake(struct mirror_set *ms)
178 queue_work(ms->kmirrord_wq, &ms->kmirrord_work); 181 queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
179} 182}
180 183
184static void delayed_wake_fn(unsigned long data)
185{
186 struct mirror_set *ms = (struct mirror_set *) data;
187
188 clear_bit(0, &ms->timer_pending);
189 wake(ms);
190}
191
192static void delayed_wake(struct mirror_set *ms)
193{
194 if (test_and_set_bit(0, &ms->timer_pending))
195 return;
196
197 ms->timer.expires = jiffies + HZ / 5;
198 ms->timer.data = (unsigned long) ms;
199 ms->timer.function = delayed_wake_fn;
200 add_timer(&ms->timer);
201}
202
181/* FIXME move this */ 203/* FIXME move this */
182static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw); 204static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
183 205
184#define MIN_REGIONS 64 206#define MIN_REGIONS 64
185#define MAX_RECOVERY 1 207#define MAX_RECOVERY 1
186static int rh_init(struct region_hash *rh, struct mirror_set *ms, 208static int rh_init(struct region_hash *rh, struct mirror_set *ms,
187 struct dirty_log *log, uint32_t region_size, 209 struct dm_dirty_log *log, uint32_t region_size,
188 region_t nr_regions) 210 region_t nr_regions)
189{ 211{
190 unsigned int nr_buckets, max_buckets; 212 unsigned int nr_buckets, max_buckets;
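A note on the delayed_wake() machinery added above: do_failures() now re-arms kmirrord through a 200 ms timer (HZ / 5, whatever HZ is configured) instead of waking it immediately, so a persistently failing mirror no longer spins the workqueue, and test_and_set_bit() on timer_pending guarantees at most one armed timer at a time. The per-call field assignments could equally use the setup_timer()/mod_timer() helpers that already existed; a sketch of that alternative arrangement, not what the patch does:

/* once, in the constructor, instead of bare init_timer(): */
setup_timer(&ms->timer, delayed_wake_fn, (unsigned long) ms);

/* per arming attempt; only the first caller after the timer
 * fires actually schedules it: */
if (!test_and_set_bit(0, &ms->timer_pending))
        mod_timer(&ms->timer, jiffies + HZ / 5);        /* 200 ms */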
@@ -249,7 +271,7 @@ static void rh_exit(struct region_hash *rh)
249 } 271 }
250 272
251 if (rh->log) 273 if (rh->log)
252 dm_destroy_dirty_log(rh->log); 274 dm_dirty_log_destroy(rh->log);
253 if (rh->region_pool) 275 if (rh->region_pool)
254 mempool_destroy(rh->region_pool); 276 mempool_destroy(rh->region_pool);
255 vfree(rh->buckets); 277 vfree(rh->buckets);
@@ -405,24 +427,22 @@ static void rh_update_states(struct region_hash *rh)
405 write_lock_irq(&rh->hash_lock); 427 write_lock_irq(&rh->hash_lock);
406 spin_lock(&rh->region_lock); 428 spin_lock(&rh->region_lock);
407 if (!list_empty(&rh->clean_regions)) { 429 if (!list_empty(&rh->clean_regions)) {
408 list_splice(&rh->clean_regions, &clean); 430 list_splice_init(&rh->clean_regions, &clean);
409 INIT_LIST_HEAD(&rh->clean_regions);
410 431
411 list_for_each_entry(reg, &clean, list) 432 list_for_each_entry(reg, &clean, list)
412 list_del(&reg->hash_list); 433 list_del(&reg->hash_list);
413 } 434 }
414 435
415 if (!list_empty(&rh->recovered_regions)) { 436 if (!list_empty(&rh->recovered_regions)) {
416 list_splice(&rh->recovered_regions, &recovered); 437 list_splice_init(&rh->recovered_regions, &recovered);
417 INIT_LIST_HEAD(&rh->recovered_regions);
418 438
419 list_for_each_entry (reg, &recovered, list) 439 list_for_each_entry (reg, &recovered, list)
420 list_del(&reg->hash_list); 440 list_del(&reg->hash_list);
421 } 441 }
422 442
423 if (!list_empty(&rh->failed_recovered_regions)) { 443 if (!list_empty(&rh->failed_recovered_regions)) {
424 list_splice(&rh->failed_recovered_regions, &failed_recovered); 444 list_splice_init(&rh->failed_recovered_regions,
425 INIT_LIST_HEAD(&rh->failed_recovered_regions); 445 &failed_recovered);
426 446
427 list_for_each_entry(reg, &failed_recovered, list) 447 list_for_each_entry(reg, &failed_recovered, list)
428 list_del(&reg->hash_list); 448 list_del(&reg->hash_list);
@@ -790,7 +810,7 @@ static int recover(struct mirror_set *ms, struct region *reg)
790{ 810{
791 int r; 811 int r;
792 unsigned int i; 812 unsigned int i;
793 struct io_region from, to[KCOPYD_MAX_REGIONS], *dest; 813 struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest;
794 struct mirror *m; 814 struct mirror *m;
795 unsigned long flags = 0; 815 unsigned long flags = 0;
796 816
@@ -822,9 +842,9 @@ static int recover(struct mirror_set *ms, struct region *reg)
822 } 842 }
823 843
824 /* hand to kcopyd */ 844 /* hand to kcopyd */
825 set_bit(KCOPYD_IGNORE_ERROR, &flags); 845 set_bit(DM_KCOPYD_IGNORE_ERROR, &flags);
826 r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags, 846 r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
827 recovery_complete, reg); 847 flags, recovery_complete, reg);
828 848
829 return r; 849 return r;
830} 850}
@@ -833,7 +853,7 @@ static void do_recovery(struct mirror_set *ms)
833{ 853{
834 int r; 854 int r;
835 struct region *reg; 855 struct region *reg;
836 struct dirty_log *log = ms->rh.log; 856 struct dm_dirty_log *log = ms->rh.log;
837 857
838 /* 858 /*
839 * Start quiescing some regions. 859 * Start quiescing some regions.
@@ -909,7 +929,7 @@ static void map_bio(struct mirror *m, struct bio *bio)
909 bio->bi_sector = map_sector(m, bio); 929 bio->bi_sector = map_sector(m, bio);
910} 930}
911 931
912static void map_region(struct io_region *io, struct mirror *m, 932static void map_region(struct dm_io_region *io, struct mirror *m,
913 struct bio *bio) 933 struct bio *bio)
914{ 934{
915 io->bdev = m->dev->bdev; 935 io->bdev = m->dev->bdev;
@@ -951,7 +971,7 @@ static void read_callback(unsigned long error, void *context)
951/* Asynchronous read. */ 971/* Asynchronous read. */
952static void read_async_bio(struct mirror *m, struct bio *bio) 972static void read_async_bio(struct mirror *m, struct bio *bio)
953{ 973{
954 struct io_region io; 974 struct dm_io_region io;
955 struct dm_io_request io_req = { 975 struct dm_io_request io_req = {
956 .bi_rw = READ, 976 .bi_rw = READ,
957 .mem.type = DM_IO_BVEC, 977 .mem.type = DM_IO_BVEC,
@@ -1019,7 +1039,7 @@ static void __bio_mark_nosync(struct mirror_set *ms,
1019{ 1039{
1020 unsigned long flags; 1040 unsigned long flags;
1021 struct region_hash *rh = &ms->rh; 1041 struct region_hash *rh = &ms->rh;
1022 struct dirty_log *log = ms->rh.log; 1042 struct dm_dirty_log *log = ms->rh.log;
1023 struct region *reg; 1043 struct region *reg;
1024 region_t region = bio_to_region(rh, bio); 1044 region_t region = bio_to_region(rh, bio);
1025 int recovering = 0; 1045 int recovering = 0;
@@ -1107,7 +1127,7 @@ out:
1107static void do_write(struct mirror_set *ms, struct bio *bio) 1127static void do_write(struct mirror_set *ms, struct bio *bio)
1108{ 1128{
1109 unsigned int i; 1129 unsigned int i;
1110 struct io_region io[ms->nr_mirrors], *dest = io; 1130 struct dm_io_region io[ms->nr_mirrors], *dest = io;
1111 struct mirror *m; 1131 struct mirror *m;
1112 struct dm_io_request io_req = { 1132 struct dm_io_request io_req = {
1113 .bi_rw = WRITE, 1133 .bi_rw = WRITE,
@@ -1182,6 +1202,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
1182 spin_lock_irq(&ms->lock); 1202 spin_lock_irq(&ms->lock);
1183 bio_list_merge(&ms->failures, &sync); 1203 bio_list_merge(&ms->failures, &sync);
1184 spin_unlock_irq(&ms->lock); 1204 spin_unlock_irq(&ms->lock);
1205 wake(ms);
1185 } else 1206 } else
1186 while ((bio = bio_list_pop(&sync))) 1207 while ((bio = bio_list_pop(&sync)))
1187 do_write(ms, bio); 1208 do_write(ms, bio);
@@ -1241,7 +1262,7 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
1241 bio_list_merge(&ms->failures, failures); 1262 bio_list_merge(&ms->failures, failures);
1242 spin_unlock_irq(&ms->lock); 1263 spin_unlock_irq(&ms->lock);
1243 1264
1244 wake(ms); 1265 delayed_wake(ms);
1245} 1266}
1246 1267
1247static void trigger_event(struct work_struct *work) 1268static void trigger_event(struct work_struct *work)
@@ -1255,7 +1276,7 @@ static void trigger_event(struct work_struct *work)
1255/*----------------------------------------------------------------- 1276/*-----------------------------------------------------------------
1256 * kmirrord 1277 * kmirrord
1257 *---------------------------------------------------------------*/ 1278 *---------------------------------------------------------------*/
1258static int _do_mirror(struct work_struct *work) 1279static void do_mirror(struct work_struct *work)
1259{ 1280{
1260 struct mirror_set *ms = container_of(work, struct mirror_set, 1281 struct mirror_set *ms = container_of(work, struct mirror_set,
1261 kmirrord_work); 1282 kmirrord_work);
@@ -1277,23 +1298,7 @@ static int _do_mirror(struct work_struct *work)
1277 do_writes(ms, &writes); 1298 do_writes(ms, &writes);
1278 do_failures(ms, &failures); 1299 do_failures(ms, &failures);
1279 1300
1280 return (ms->failures.head) ? 1 : 0; 1301 dm_table_unplug_all(ms->ti->table);
1281}
1282
1283static void do_mirror(struct work_struct *work)
1284{
1285 /*
1286 * If _do_mirror returns 1, we give it
1287 * another shot. This helps for cases like
1288 * 'suspend' where we call flush_workqueue
1289 * and expect all work to be finished. If
1290 * a failure happens during a suspend, we
1291 * couldn't issue a 'wake' because it would
1292 * not be honored. Therefore, we return '1'
1293 * from _do_mirror, and retry here.
1294 */
1295 while (_do_mirror(work))
1296 schedule();
1297} 1302}
1298 1303
1299 1304
@@ -1303,7 +1308,7 @@ static void do_mirror(struct work_struct *work)
1303static struct mirror_set *alloc_context(unsigned int nr_mirrors, 1308static struct mirror_set *alloc_context(unsigned int nr_mirrors,
1304 uint32_t region_size, 1309 uint32_t region_size,
1305 struct dm_target *ti, 1310 struct dm_target *ti,
1306 struct dirty_log *dl) 1311 struct dm_dirty_log *dl)
1307{ 1312{
1308 size_t len; 1313 size_t len;
1309 struct mirror_set *ms = NULL; 1314 struct mirror_set *ms = NULL;
@@ -1403,12 +1408,12 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
1403/* 1408/*
1404 * Create dirty log: log_type #log_params <log_params> 1409 * Create dirty log: log_type #log_params <log_params>
1405 */ 1410 */
1406static struct dirty_log *create_dirty_log(struct dm_target *ti, 1411static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
1407 unsigned int argc, char **argv, 1412 unsigned int argc, char **argv,
1408 unsigned int *args_used) 1413 unsigned int *args_used)
1409{ 1414{
1410 unsigned int param_count; 1415 unsigned int param_count;
1411 struct dirty_log *dl; 1416 struct dm_dirty_log *dl;
1412 1417
1413 if (argc < 2) { 1418 if (argc < 2) {
1414 ti->error = "Insufficient mirror log arguments"; 1419 ti->error = "Insufficient mirror log arguments";
@@ -1427,7 +1432,7 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti,
1427 return NULL; 1432 return NULL;
1428 } 1433 }
1429 1434
1430 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2); 1435 dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
1431 if (!dl) { 1436 if (!dl) {
1432 ti->error = "Error creating mirror dirty log"; 1437 ti->error = "Error creating mirror dirty log";
1433 return NULL; 1438 return NULL;
@@ -1435,7 +1440,7 @@ static struct dirty_log *create_dirty_log(struct dm_target *ti,
1435 1440
1436 if (!_check_region_size(ti, dl->type->get_region_size(dl))) { 1441 if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
1437 ti->error = "Invalid region size"; 1442 ti->error = "Invalid region size";
1438 dm_destroy_dirty_log(dl); 1443 dm_dirty_log_destroy(dl);
1439 return NULL; 1444 return NULL;
1440 } 1445 }
1441 1446
@@ -1496,7 +1501,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1496 int r; 1501 int r;
1497 unsigned int nr_mirrors, m, args_used; 1502 unsigned int nr_mirrors, m, args_used;
1498 struct mirror_set *ms; 1503 struct mirror_set *ms;
1499 struct dirty_log *dl; 1504 struct dm_dirty_log *dl;
1500 1505
1501 dl = create_dirty_log(ti, argc, argv, &args_used); 1506 dl = create_dirty_log(ti, argc, argv, &args_used);
1502 if (!dl) 1507 if (!dl)
@@ -1506,9 +1511,9 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1506 argc -= args_used; 1511 argc -= args_used;
1507 1512
1508 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || 1513 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
1509 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) { 1514 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
1510 ti->error = "Invalid number of mirrors"; 1515 ti->error = "Invalid number of mirrors";
1511 dm_destroy_dirty_log(dl); 1516 dm_dirty_log_destroy(dl);
1512 return -EINVAL; 1517 return -EINVAL;
1513 } 1518 }
1514 1519
@@ -1516,13 +1521,13 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1516 1521
1517 if (argc < nr_mirrors * 2) { 1522 if (argc < nr_mirrors * 2) {
1518 ti->error = "Too few mirror arguments"; 1523 ti->error = "Too few mirror arguments";
1519 dm_destroy_dirty_log(dl); 1524 dm_dirty_log_destroy(dl);
1520 return -EINVAL; 1525 return -EINVAL;
1521 } 1526 }
1522 1527
1523 ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); 1528 ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
1524 if (!ms) { 1529 if (!ms) {
1525 dm_destroy_dirty_log(dl); 1530 dm_dirty_log_destroy(dl);
1526 return -ENOMEM; 1531 return -ENOMEM;
1527 } 1532 }
1528 1533
@@ -1547,6 +1552,8 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1547 goto err_free_context; 1552 goto err_free_context;
1548 } 1553 }
1549 INIT_WORK(&ms->kmirrord_work, do_mirror); 1554 INIT_WORK(&ms->kmirrord_work, do_mirror);
1555 init_timer(&ms->timer);
1556 ms->timer_pending = 0;
1550 INIT_WORK(&ms->trigger_event, trigger_event); 1557 INIT_WORK(&ms->trigger_event, trigger_event);
1551 1558
1552 r = parse_features(ms, argc, argv, &args_used); 1559 r = parse_features(ms, argc, argv, &args_used);
@@ -1571,7 +1578,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1571 goto err_destroy_wq; 1578 goto err_destroy_wq;
1572 } 1579 }
1573 1580
1574 r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client); 1581 r = dm_kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
1575 if (r) 1582 if (r)
1576 goto err_destroy_wq; 1583 goto err_destroy_wq;
1577 1584
@@ -1589,8 +1596,9 @@ static void mirror_dtr(struct dm_target *ti)
1589{ 1596{
1590 struct mirror_set *ms = (struct mirror_set *) ti->private; 1597 struct mirror_set *ms = (struct mirror_set *) ti->private;
1591 1598
1599 del_timer_sync(&ms->timer);
1592 flush_workqueue(ms->kmirrord_wq); 1600 flush_workqueue(ms->kmirrord_wq);
1593 kcopyd_client_destroy(ms->kcopyd_client); 1601 dm_kcopyd_client_destroy(ms->kcopyd_client);
1594 destroy_workqueue(ms->kmirrord_wq); 1602 destroy_workqueue(ms->kmirrord_wq);
1595 free_context(ms, ti, ms->nr_mirrors); 1603 free_context(ms, ti, ms->nr_mirrors);
1596} 1604}
@@ -1734,7 +1742,7 @@ out:
1734static void mirror_presuspend(struct dm_target *ti) 1742static void mirror_presuspend(struct dm_target *ti)
1735{ 1743{
1736 struct mirror_set *ms = (struct mirror_set *) ti->private; 1744 struct mirror_set *ms = (struct mirror_set *) ti->private;
1737 struct dirty_log *log = ms->rh.log; 1745 struct dm_dirty_log *log = ms->rh.log;
1738 1746
1739 atomic_set(&ms->suspend, 1); 1747 atomic_set(&ms->suspend, 1);
1740 1748
@@ -1763,7 +1771,7 @@ static void mirror_presuspend(struct dm_target *ti)
1763static void mirror_postsuspend(struct dm_target *ti) 1771static void mirror_postsuspend(struct dm_target *ti)
1764{ 1772{
1765 struct mirror_set *ms = ti->private; 1773 struct mirror_set *ms = ti->private;
1766 struct dirty_log *log = ms->rh.log; 1774 struct dm_dirty_log *log = ms->rh.log;
1767 1775
1768 if (log->type->postsuspend && log->type->postsuspend(log)) 1776 if (log->type->postsuspend && log->type->postsuspend(log))
1769 /* FIXME: need better error handling */ 1777 /* FIXME: need better error handling */
@@ -1773,7 +1781,7 @@ static void mirror_postsuspend(struct dm_target *ti)
1773static void mirror_resume(struct dm_target *ti) 1781static void mirror_resume(struct dm_target *ti)
1774{ 1782{
1775 struct mirror_set *ms = ti->private; 1783 struct mirror_set *ms = ti->private;
1776 struct dirty_log *log = ms->rh.log; 1784 struct dm_dirty_log *log = ms->rh.log;
1777 1785
1778 atomic_set(&ms->suspend, 0); 1786 atomic_set(&ms->suspend, 0);
1779 if (log->type->resume && log->type->resume(log)) 1787 if (log->type->resume && log->type->resume(log))
@@ -1811,7 +1819,7 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
1811{ 1819{
1812 unsigned int m, sz = 0; 1820 unsigned int m, sz = 0;
1813 struct mirror_set *ms = (struct mirror_set *) ti->private; 1821 struct mirror_set *ms = (struct mirror_set *) ti->private;
1814 struct dirty_log *log = ms->rh.log; 1822 struct dm_dirty_log *log = ms->rh.log;
1815 char buffer[ms->nr_mirrors + 1]; 1823 char buffer[ms->nr_mirrors + 1];
1816 1824
1817 switch (type) { 1825 switch (type) {
@@ -1864,15 +1872,9 @@ static int __init dm_mirror_init(void)
1864{ 1872{
1865 int r; 1873 int r;
1866 1874
1867 r = dm_dirty_log_init();
1868 if (r)
1869 return r;
1870
1871 r = dm_register_target(&mirror_target); 1875 r = dm_register_target(&mirror_target);
1872 if (r < 0) { 1876 if (r < 0)
1873 DMERR("Failed to register mirror target"); 1877 DMERR("Failed to register mirror target");
1874 dm_dirty_log_exit();
1875 }
1876 1878
1877 return r; 1879 return r;
1878} 1880}
@@ -1884,8 +1886,6 @@ static void __exit dm_mirror_exit(void)
1884 r = dm_unregister_target(&mirror_target); 1886 r = dm_unregister_target(&mirror_target);
1885 if (r < 0) 1887 if (r < 0)
1886 DMERR("unregister failed %d", r); 1888 DMERR("unregister failed %d", r);
1887
1888 dm_dirty_log_exit();
1889} 1889}
1890 1890
1891/* Module hooks */ 1891/* Module hooks */
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 4dc8a43c034b..1ba8a47d61b1 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -18,10 +18,10 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/vmalloc.h> 19#include <linux/vmalloc.h>
20#include <linux/log2.h> 20#include <linux/log2.h>
21#include <linux/dm-kcopyd.h>
21 22
22#include "dm-snap.h" 23#include "dm-snap.h"
23#include "dm-bio-list.h" 24#include "dm-bio-list.h"
24#include "kcopyd.h"
25 25
26#define DM_MSG_PREFIX "snapshots" 26#define DM_MSG_PREFIX "snapshots"
27 27
@@ -36,9 +36,9 @@
36#define SNAPSHOT_COPY_PRIORITY 2 36#define SNAPSHOT_COPY_PRIORITY 2
37 37
38/* 38/*
39 * Each snapshot reserves this many pages for io 39 * Reserve 1MB for each snapshot initially (with a minimum of 1 page).
40 */ 40 */
41#define SNAPSHOT_PAGES 256 41#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)
42 42
43static struct workqueue_struct *ksnapd; 43static struct workqueue_struct *ksnapd;
44static void flush_queued_bios(struct work_struct *work); 44static void flush_queued_bios(struct work_struct *work);
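The new SNAPSHOT_PAGES definition budgets the kcopyd client in bytes rather than as a raw page count, so the reservation stays at 1 MiB whatever the page size; "? :" is the GNU elvis extension and falls back to one page should the shift ever yield zero. With 4 KiB pages it still evaluates to the old value of 256, which this stand-alone check (an illustration compiled with GCC, not kernel code) confirms:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KiB pages */
#define SNAPSHOT_PAGES (((1UL << 20) >> PAGE_SHIFT) ? : 1)

int main(void)
{
        /* prints 256, the budget the old constant hard-coded */
        printf("%lu\n", SNAPSHOT_PAGES);
        return 0;
}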
@@ -536,7 +536,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
536 s->last_percent = 0; 536 s->last_percent = 0;
537 init_rwsem(&s->lock); 537 init_rwsem(&s->lock);
538 spin_lock_init(&s->pe_lock); 538 spin_lock_init(&s->pe_lock);
539 s->table = ti->table; 539 s->ti = ti;
540 540
541 /* Allocate hash table for COW data */ 541 /* Allocate hash table for COW data */
542 if (init_hash_tables(s)) { 542 if (init_hash_tables(s)) {
@@ -558,7 +558,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
558 goto bad4; 558 goto bad4;
559 } 559 }
560 560
561 r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client); 561 r = dm_kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
562 if (r) { 562 if (r) {
563 ti->error = "Could not create kcopyd client"; 563 ti->error = "Could not create kcopyd client";
564 goto bad5; 564 goto bad5;
@@ -591,7 +591,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
591 return 0; 591 return 0;
592 592
593 bad6: 593 bad6:
594 kcopyd_client_destroy(s->kcopyd_client); 594 dm_kcopyd_client_destroy(s->kcopyd_client);
595 595
596 bad5: 596 bad5:
597 s->store.destroy(&s->store); 597 s->store.destroy(&s->store);
@@ -613,7 +613,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
613 613
614static void __free_exceptions(struct dm_snapshot *s) 614static void __free_exceptions(struct dm_snapshot *s)
615{ 615{
616 kcopyd_client_destroy(s->kcopyd_client); 616 dm_kcopyd_client_destroy(s->kcopyd_client);
617 s->kcopyd_client = NULL; 617 s->kcopyd_client = NULL;
618 618
619 exit_exception_table(&s->pending, pending_cache); 619 exit_exception_table(&s->pending, pending_cache);
@@ -699,7 +699,7 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
699 699
700 s->valid = 0; 700 s->valid = 0;
701 701
702 dm_table_event(s->table); 702 dm_table_event(s->ti->table);
703} 703}
704 704
705static void get_pending_exception(struct dm_snap_pending_exception *pe) 705static void get_pending_exception(struct dm_snap_pending_exception *pe)
@@ -824,7 +824,7 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
824static void start_copy(struct dm_snap_pending_exception *pe) 824static void start_copy(struct dm_snap_pending_exception *pe)
825{ 825{
826 struct dm_snapshot *s = pe->snap; 826 struct dm_snapshot *s = pe->snap;
827 struct io_region src, dest; 827 struct dm_io_region src, dest;
828 struct block_device *bdev = s->origin->bdev; 828 struct block_device *bdev = s->origin->bdev;
829 sector_t dev_size; 829 sector_t dev_size;
830 830
@@ -839,7 +839,7 @@ static void start_copy(struct dm_snap_pending_exception *pe)
839 dest.count = src.count; 839 dest.count = src.count;
840 840
841 /* Hand over to kcopyd */ 841 /* Hand over to kcopyd */
842 kcopyd_copy(s->kcopyd_client, 842 dm_kcopyd_copy(s->kcopyd_client,
843 &src, 1, &dest, 0, copy_callback, pe); 843 &src, 1, &dest, 0, copy_callback, pe);
844} 844}
845 845
@@ -1060,7 +1060,7 @@ static int __origin_write(struct list_head *snapshots, struct bio *bio)
1060 goto next_snapshot; 1060 goto next_snapshot;
1061 1061
1062 /* Nothing to do if writing beyond end of snapshot */ 1062 /* Nothing to do if writing beyond end of snapshot */
1063 if (bio->bi_sector >= dm_table_get_size(snap->table)) 1063 if (bio->bi_sector >= dm_table_get_size(snap->ti->table))
1064 goto next_snapshot; 1064 goto next_snapshot;
1065 1065
1066 /* 1066 /*
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
index 93bce5d49742..24f9fb73b982 100644
--- a/drivers/md/dm-snap.h
+++ b/drivers/md/dm-snap.h
@@ -132,7 +132,7 @@ struct exception_store {
132 132
133struct dm_snapshot { 133struct dm_snapshot {
134 struct rw_semaphore lock; 134 struct rw_semaphore lock;
135 struct dm_table *table; 135 struct dm_target *ti;
136 136
137 struct dm_dev *origin; 137 struct dm_dev *origin;
138 struct dm_dev *cow; 138 struct dm_dev *cow;
@@ -169,7 +169,7 @@ struct dm_snapshot {
169 /* The on disk metadata handler */ 169 /* The on disk metadata handler */
170 struct exception_store store; 170 struct exception_store store;
171 171
172 struct kcopyd_client *kcopyd_client; 172 struct dm_kcopyd_client *kcopyd_client;
173 173
174 /* Queue of snapshot writes for ksnapd to flush */ 174 /* Queue of snapshot writes for ksnapd to flush */
175 struct bio_list queued_bios; 175 struct bio_list queued_bios;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e75b1437b58b..94116eaf4709 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -245,44 +245,6 @@ int dm_table_create(struct dm_table **result, int mode,
245 return 0; 245 return 0;
246} 246}
247 247
248int dm_create_error_table(struct dm_table **result, struct mapped_device *md)
249{
250 struct dm_table *t;
251 sector_t dev_size = 1;
252 int r;
253
254 /*
255 * Find current size of device.
256 * Default to 1 sector if inactive.
257 */
258 t = dm_get_table(md);
259 if (t) {
260 dev_size = dm_table_get_size(t);
261 dm_table_put(t);
262 }
263
264 r = dm_table_create(&t, FMODE_READ, 1, md);
265 if (r)
266 return r;
267
268 r = dm_table_add_target(t, "error", 0, dev_size, NULL);
269 if (r)
270 goto out;
271
272 r = dm_table_complete(t);
273 if (r)
274 goto out;
275
276 *result = t;
277
278out:
279 if (r)
280 dm_table_put(t);
281
282 return r;
283}
284EXPORT_SYMBOL_GPL(dm_create_error_table);
285
286static void free_devices(struct list_head *devices) 248static void free_devices(struct list_head *devices)
287{ 249{
288 struct list_head *tmp, *next; 250 struct list_head *tmp, *next;
@@ -911,10 +873,11 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
911 q->max_hw_sectors = t->limits.max_hw_sectors; 873 q->max_hw_sectors = t->limits.max_hw_sectors;
912 q->seg_boundary_mask = t->limits.seg_boundary_mask; 874 q->seg_boundary_mask = t->limits.seg_boundary_mask;
913 q->bounce_pfn = t->limits.bounce_pfn; 875 q->bounce_pfn = t->limits.bounce_pfn;
876
914 if (t->limits.no_cluster) 877 if (t->limits.no_cluster)
915 q->queue_flags &= ~(1 << QUEUE_FLAG_CLUSTER); 878 queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
916 else 879 else
917 q->queue_flags |= (1 << QUEUE_FLAG_CLUSTER); 880 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
918 881
919} 882}
920 883
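queue_flag_set_unlocked() and its clear counterpart replace the open-coded bit arithmetic on q->queue_flags. In this era of the block layer they are, to a close approximation, thin non-atomic bit helpers (reconstructed from memory rather than copied from the tree):

/* approximately what <linux/blkdev.h> provides at this point: */
static inline void queue_flag_set_unlocked(unsigned int flag,
                                           struct request_queue *q)
{
        __set_bit(flag, &q->queue_flags);
}

static inline void queue_flag_clear_unlocked(unsigned int flag,
                                             struct request_queue *q)
{
        __clear_bit(flag, &q->queue_flags);
}

The _unlocked variants are appropriate here because the caller serializes access to the queue; the md.c hunk below makes the same point explicitly for a freshly allocated queue that nobody else can see yet.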
@@ -954,7 +917,7 @@ void dm_table_presuspend_targets(struct dm_table *t)
954 if (!t) 917 if (!t)
955 return; 918 return;
956 919
957 return suspend_targets(t, 0); 920 suspend_targets(t, 0);
958} 921}
959 922
960void dm_table_postsuspend_targets(struct dm_table *t) 923void dm_table_postsuspend_targets(struct dm_table *t)
@@ -962,7 +925,7 @@ void dm_table_postsuspend_targets(struct dm_table *t)
962 if (!t) 925 if (!t)
963 return; 926 return;
964 927
965 return suspend_targets(t, 1); 928 suspend_targets(t, 1);
966} 929}
967 930
968int dm_table_resume_targets(struct dm_table *t) 931int dm_table_resume_targets(struct dm_table *t)
diff --git a/drivers/md/dm-uevent.c b/drivers/md/dm-uevent.c
index 50377e5dc2a3..6f65883aef12 100644
--- a/drivers/md/dm-uevent.c
+++ b/drivers/md/dm-uevent.c
@@ -78,7 +78,7 @@ static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md,
78 78
79 event = dm_uevent_alloc(md); 79 event = dm_uevent_alloc(md);
80 if (!event) { 80 if (!event) {
81 DMERR("%s: dm_uevent_alloc() failed", __FUNCTION__); 81 DMERR("%s: dm_uevent_alloc() failed", __func__);
82 goto err_nomem; 82 goto err_nomem;
83 } 83 }
84 84
@@ -86,32 +86,32 @@ static struct dm_uevent *dm_build_path_uevent(struct mapped_device *md,
86 86
87 if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) { 87 if (add_uevent_var(&event->ku_env, "DM_TARGET=%s", ti->type->name)) {
88 DMERR("%s: add_uevent_var() for DM_TARGET failed", 88 DMERR("%s: add_uevent_var() for DM_TARGET failed",
89 __FUNCTION__); 89 __func__);
90 goto err_add; 90 goto err_add;
91 } 91 }
92 92
93 if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) { 93 if (add_uevent_var(&event->ku_env, "DM_ACTION=%s", dm_action)) {
94 DMERR("%s: add_uevent_var() for DM_ACTION failed", 94 DMERR("%s: add_uevent_var() for DM_ACTION failed",
95 __FUNCTION__); 95 __func__);
96 goto err_add; 96 goto err_add;
97 } 97 }
98 98
99 if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u", 99 if (add_uevent_var(&event->ku_env, "DM_SEQNUM=%u",
100 dm_next_uevent_seq(md))) { 100 dm_next_uevent_seq(md))) {
101 DMERR("%s: add_uevent_var() for DM_SEQNUM failed", 101 DMERR("%s: add_uevent_var() for DM_SEQNUM failed",
102 __FUNCTION__); 102 __func__);
103 goto err_add; 103 goto err_add;
104 } 104 }
105 105
106 if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) { 106 if (add_uevent_var(&event->ku_env, "DM_PATH=%s", path)) {
107 DMERR("%s: add_uevent_var() for DM_PATH failed", __FUNCTION__); 107 DMERR("%s: add_uevent_var() for DM_PATH failed", __func__);
108 goto err_add; 108 goto err_add;
109 } 109 }
110 110
111 if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d", 111 if (add_uevent_var(&event->ku_env, "DM_NR_VALID_PATHS=%d",
112 nr_valid_paths)) { 112 nr_valid_paths)) {
113 DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed", 113 DMERR("%s: add_uevent_var() for DM_NR_VALID_PATHS failed",
114 __FUNCTION__); 114 __func__);
115 goto err_add; 115 goto err_add;
116 } 116 }
117 117
@@ -146,25 +146,25 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
146 if (dm_copy_name_and_uuid(event->md, event->name, 146 if (dm_copy_name_and_uuid(event->md, event->name,
147 event->uuid)) { 147 event->uuid)) {
148 DMERR("%s: dm_copy_name_and_uuid() failed", 148 DMERR("%s: dm_copy_name_and_uuid() failed",
149 __FUNCTION__); 149 __func__);
150 goto uevent_free; 150 goto uevent_free;
151 } 151 }
152 152
153 if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) { 153 if (add_uevent_var(&event->ku_env, "DM_NAME=%s", event->name)) {
154 DMERR("%s: add_uevent_var() for DM_NAME failed", 154 DMERR("%s: add_uevent_var() for DM_NAME failed",
155 __FUNCTION__); 155 __func__);
156 goto uevent_free; 156 goto uevent_free;
157 } 157 }
158 158
159 if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) { 159 if (add_uevent_var(&event->ku_env, "DM_UUID=%s", event->uuid)) {
160 DMERR("%s: add_uevent_var() for DM_UUID failed", 160 DMERR("%s: add_uevent_var() for DM_UUID failed",
161 __FUNCTION__); 161 __func__);
162 goto uevent_free; 162 goto uevent_free;
163 } 163 }
164 164
165 r = kobject_uevent_env(kobj, event->action, event->ku_env.envp); 165 r = kobject_uevent_env(kobj, event->action, event->ku_env.envp);
166 if (r) 166 if (r)
167 DMERR("%s: kobject_uevent_env failed", __FUNCTION__); 167 DMERR("%s: kobject_uevent_env failed", __func__);
168uevent_free: 168uevent_free:
169 dm_uevent_free(event); 169 dm_uevent_free(event);
170 } 170 }
@@ -187,7 +187,7 @@ void dm_path_uevent(enum dm_uevent_type event_type, struct dm_target *ti,
187 struct dm_uevent *event; 187 struct dm_uevent *event;
188 188
189 if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) { 189 if (event_type >= ARRAY_SIZE(_dm_uevent_type_names)) {
190 DMERR("%s: Invalid event_type %d", __FUNCTION__, event_type); 190 DMERR("%s: Invalid event_type %d", __func__, event_type);
191 goto out; 191 goto out;
192 } 192 }
193 193
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6617ce4af095..372369b1cc20 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -204,6 +204,7 @@ static int (*_inits[])(void) __initdata = {
204 dm_target_init, 204 dm_target_init,
205 dm_linear_init, 205 dm_linear_init,
206 dm_stripe_init, 206 dm_stripe_init,
207 dm_kcopyd_init,
207 dm_interface_init, 208 dm_interface_init,
208}; 209};
209 210
@@ -212,6 +213,7 @@ static void (*_exits[])(void) = {
212 dm_target_exit, 213 dm_target_exit,
213 dm_linear_exit, 214 dm_linear_exit,
214 dm_stripe_exit, 215 dm_stripe_exit,
216 dm_kcopyd_exit,
215 dm_interface_exit, 217 dm_interface_exit,
216}; 218};
217 219
@@ -922,7 +924,7 @@ static void free_minor(int minor)
922/* 924/*
923 * See if the device with a specific minor # is free. 925 * See if the device with a specific minor # is free.
924 */ 926 */
925static int specific_minor(struct mapped_device *md, int minor) 927static int specific_minor(int minor)
926{ 928{
927 int r, m; 929 int r, m;
928 930
@@ -955,7 +957,7 @@ out:
955 return r; 957 return r;
956} 958}
957 959
958static int next_free_minor(struct mapped_device *md, int *minor) 960static int next_free_minor(int *minor)
959{ 961{
960 int r, m; 962 int r, m;
961 963
@@ -966,9 +968,8 @@ static int next_free_minor(struct mapped_device *md, int *minor)
966 spin_lock(&_minor_lock); 968 spin_lock(&_minor_lock);
967 969
968 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 970 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
969 if (r) { 971 if (r)
970 goto out; 972 goto out;
971 }
972 973
973 if (m >= (1 << MINORBITS)) { 974 if (m >= (1 << MINORBITS)) {
974 idr_remove(&_minor_idr, m); 975 idr_remove(&_minor_idr, m);
@@ -991,7 +992,7 @@ static struct block_device_operations dm_blk_dops;
991static struct mapped_device *alloc_dev(int minor) 992static struct mapped_device *alloc_dev(int minor)
992{ 993{
993 int r; 994 int r;
994 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); 995 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
995 void *old_md; 996 void *old_md;
996 997
997 if (!md) { 998 if (!md) {
@@ -1004,13 +1005,12 @@ static struct mapped_device *alloc_dev(int minor)
1004 1005
1005 /* get a minor number for the dev */ 1006 /* get a minor number for the dev */
1006 if (minor == DM_ANY_MINOR) 1007 if (minor == DM_ANY_MINOR)
1007 r = next_free_minor(md, &minor); 1008 r = next_free_minor(&minor);
1008 else 1009 else
1009 r = specific_minor(md, minor); 1010 r = specific_minor(minor);
1010 if (r < 0) 1011 if (r < 0)
1011 goto bad_minor; 1012 goto bad_minor;
1012 1013
1013 memset(md, 0, sizeof(*md));
1014 init_rwsem(&md->io_lock); 1014 init_rwsem(&md->io_lock);
1015 mutex_init(&md->suspend_lock); 1015 mutex_init(&md->suspend_lock);
1016 spin_lock_init(&md->pushback_lock); 1016 spin_lock_init(&md->pushback_lock);
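alloc_dev() also switches from kmalloc() plus an explicit memset() to kzalloc(), which allocates and zeroes in one step and makes the removed memset(md, 0, sizeof(*md)) redundant. Roughly, as a sketch of the semantics only (the real implementation lives in the slab allocator):

#include <linux/slab.h>
#include <linux/string.h>

static inline void *kzalloc_sketch(size_t size, gfp_t flags)
{
        void *p = kmalloc(size, flags);

        if (p)
                memset(p, 0, size);
        return p;
}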
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b4584a39383b..8c03b634e62e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -16,67 +16,6 @@
16#include <linux/blkdev.h> 16#include <linux/blkdev.h>
17#include <linux/hdreg.h> 17#include <linux/hdreg.h>
18 18
19#define DM_NAME "device-mapper"
20
21#define DMERR(f, arg...) \
22 printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
23#define DMERR_LIMIT(f, arg...) \
24 do { \
25 if (printk_ratelimit()) \
26 printk(KERN_ERR DM_NAME ": " DM_MSG_PREFIX ": " \
27 f "\n", ## arg); \
28 } while (0)
29
30#define DMWARN(f, arg...) \
31 printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
32#define DMWARN_LIMIT(f, arg...) \
33 do { \
34 if (printk_ratelimit()) \
35 printk(KERN_WARNING DM_NAME ": " DM_MSG_PREFIX ": " \
36 f "\n", ## arg); \
37 } while (0)
38
39#define DMINFO(f, arg...) \
40 printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f "\n", ## arg)
41#define DMINFO_LIMIT(f, arg...) \
42 do { \
43 if (printk_ratelimit()) \
44 printk(KERN_INFO DM_NAME ": " DM_MSG_PREFIX ": " f \
45 "\n", ## arg); \
46 } while (0)
47
48#ifdef CONFIG_DM_DEBUG
49# define DMDEBUG(f, arg...) \
50 printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX " DEBUG: " f "\n", ## arg)
51# define DMDEBUG_LIMIT(f, arg...) \
52 do { \
53 if (printk_ratelimit()) \
54 printk(KERN_DEBUG DM_NAME ": " DM_MSG_PREFIX ": " f \
55 "\n", ## arg); \
56 } while (0)
57#else
58# define DMDEBUG(f, arg...) do {} while (0)
59# define DMDEBUG_LIMIT(f, arg...) do {} while (0)
60#endif
61
62#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
63 0 : scnprintf(result + sz, maxlen - sz, x))
64
65#define SECTOR_SHIFT 9
66
67/*
68 * Definitions of return values from target end_io function.
69 */
70#define DM_ENDIO_INCOMPLETE 1
71#define DM_ENDIO_REQUEUE 2
72
73/*
74 * Definitions of return values from target map function.
75 */
76#define DM_MAPIO_SUBMITTED 0
77#define DM_MAPIO_REMAPPED 1
78#define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE
79
80/* 19/*
81 * Suspend feature flags 20 * Suspend feature flags
82 */ 21 */
@@ -136,34 +75,6 @@ static inline int array_too_big(unsigned long fixed, unsigned long obj,
136 return (num > (ULONG_MAX - fixed) / obj); 75 return (num > (ULONG_MAX - fixed) / obj);
137} 76}
138 77
139/*
140 * Ceiling(n / sz)
141 */
142#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz))
143
144#define dm_sector_div_up(n, sz) ( \
145{ \
146 sector_t _r = ((n) + (sz) - 1); \
147 sector_div(_r, (sz)); \
148 _r; \
149} \
150)
151
152/*
153 * ceiling(n / size) * size
154 */
155#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz))
156
157static inline sector_t to_sector(unsigned long n)
158{
159 return (n >> 9);
160}
161
162static inline unsigned long to_bytes(sector_t n)
163{
164 return (n << 9);
165}
166
167int dm_split_args(int *argc, char ***argvp, char *input); 78int dm_split_args(int *argc, char ***argvp, char *input);
168 79
169/* 80/*
@@ -189,4 +100,13 @@ int dm_lock_for_deletion(struct mapped_device *md);
189 100
190void dm_kobject_uevent(struct mapped_device *md); 101void dm_kobject_uevent(struct mapped_device *md);
191 102
103/*
104 * Dirty log
105 */
106int dm_dirty_log_init(void);
107void dm_dirty_log_exit(void);
108
109int dm_kcopyd_init(void);
110void dm_kcopyd_exit(void);
111
192#endif 112#endif
diff --git a/drivers/md/kcopyd.h b/drivers/md/kcopyd.h
deleted file mode 100644
index 4845f2a0c676..000000000000
--- a/drivers/md/kcopyd.h
+++ /dev/null
@@ -1,42 +0,0 @@
1/*
2 * Copyright (C) 2001 Sistina Software
3 *
4 * This file is released under the GPL.
5 *
6 * Kcopyd provides a simple interface for copying an area of one
7 * block-device to one or more other block-devices, with an asynchronous
8 * completion notification.
9 */
10
11#ifndef DM_KCOPYD_H
12#define DM_KCOPYD_H
13
14#include "dm-io.h"
15
16/* FIXME: make this configurable */
17#define KCOPYD_MAX_REGIONS 8
18
19#define KCOPYD_IGNORE_ERROR 1
20
21/*
22 * To use kcopyd you must first create a kcopyd client object.
23 */
24struct kcopyd_client;
25int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
26void kcopyd_client_destroy(struct kcopyd_client *kc);
27
28/*
29 * Submit a copy job to kcopyd. This is built on top of the
30 * client functions above.
31 *
32 * read_err is a boolean,
33 * write_err is a bitset, with 1 bit for each destination region
34 */
35typedef void (*kcopyd_notify_fn)(int read_err, unsigned long write_err,
36 void *context);
37
38int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
39 unsigned int num_dests, struct io_region *dests,
40 unsigned int flags, kcopyd_notify_fn fn, void *context);
41
42#endif
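As with the dirty log header, this interface survives under dm_ prefixes in <linux/dm-kcopyd.h>, which dm-snap.c and dm-raid1.c now include. A sketch of a single-destination copy through the renamed entry points; copy_done() and copy_one_region() are illustrative names, not from the tree:

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>

static void copy_done(int read_err, unsigned long write_err, void *context)
{
        /* read_err is a boolean, write_err a per-destination bitset */
}

static int copy_one_region(struct dm_kcopyd_client *kc,
                           struct block_device *src_bdev,
                           struct block_device *dst_bdev,
                           sector_t sector, sector_t count)
{
        struct dm_io_region from, to;

        from.bdev = src_bdev;
        from.sector = sector;
        from.count = count;

        to.bdev = dst_bdev;
        to.sector = sector;
        to.count = count;

        /* flags = 0: no DM_KCOPYD_IGNORE_ERROR */
        return dm_kcopyd_copy(kc, &from, 1, &to, 0, copy_done, NULL);
}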
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5ebfb4d79901..83eb78b00137 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -276,13 +276,15 @@ static mddev_t * mddev_find(dev_t unit)
276 init_waitqueue_head(&new->sb_wait); 276 init_waitqueue_head(&new->sb_wait);
277 new->reshape_position = MaxSector; 277 new->reshape_position = MaxSector;
278 new->resync_max = MaxSector; 278 new->resync_max = MaxSector;
279 new->level = LEVEL_NONE;
279 280
280 new->queue = blk_alloc_queue(GFP_KERNEL); 281 new->queue = blk_alloc_queue(GFP_KERNEL);
281 if (!new->queue) { 282 if (!new->queue) {
282 kfree(new); 283 kfree(new);
283 return NULL; 284 return NULL;
284 } 285 }
285 set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags); 286 /* Can be unlocked because the queue is new: no concurrency */
287 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue);
286 288
287 blk_queue_make_request(new->queue, md_fail_request); 289 blk_queue_make_request(new->queue, md_fail_request);
288 290
@@ -731,9 +733,9 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
731 else 733 else
732 rdev->desc_nr = sb->this_disk.number; 734 rdev->desc_nr = sb->this_disk.number;
733 735
734 if (refdev == 0) 736 if (!refdev) {
735 ret = 1; 737 ret = 1;
736 else { 738 } else {
737 __u64 ev1, ev2; 739 __u64 ev1, ev2;
738 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 740 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
739 if (!uuid_equal(refsb, sb)) { 741 if (!uuid_equal(refsb, sb)) {
@@ -1116,9 +1118,9 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1116 else 1118 else
1117 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1119 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1118 1120
1119 if (refdev == 0) 1121 if (!refdev) {
1120 ret = 1; 1122 ret = 1;
1121 else { 1123 } else {
1122 __u64 ev1, ev2; 1124 __u64 ev1, ev2;
1123 struct mdp_superblock_1 *refsb = 1125 struct mdp_superblock_1 *refsb =
1124 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1126 (struct mdp_superblock_1*)page_address(refdev->sb_page);
@@ -1368,6 +1370,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1368 MD_BUG(); 1370 MD_BUG();
1369 return -EINVAL; 1371 return -EINVAL;
1370 } 1372 }
1373
1374 /* prevent duplicates */
1375 if (find_rdev(mddev, rdev->bdev->bd_dev))
1376 return -EEXIST;
1377
1371 /* make sure rdev->size exceeds mddev->size */ 1378 /* make sure rdev->size exceeds mddev->size */
1372 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) { 1379 if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
1373 if (mddev->pers) { 1380 if (mddev->pers) {
@@ -1651,6 +1658,8 @@ static void md_update_sb(mddev_t * mddev, int force_change)
1651 int sync_req; 1658 int sync_req;
1652 int nospares = 0; 1659 int nospares = 0;
1653 1660
1661 if (mddev->external)
1662 return;
1654repeat: 1663repeat:
1655 spin_lock_irq(&mddev->write_lock); 1664 spin_lock_irq(&mddev->write_lock);
1656 1665
@@ -1819,6 +1828,10 @@ state_show(mdk_rdev_t *rdev, char *page)
1819 len += sprintf(page+len, "%swrite_mostly",sep); 1828 len += sprintf(page+len, "%swrite_mostly",sep);
1820 sep = ","; 1829 sep = ",";
1821 } 1830 }
1831 if (test_bit(Blocked, &rdev->flags)) {
1832 len += sprintf(page+len, "%sblocked", sep);
1833 sep = ",";
1834 }
1822 if (!test_bit(Faulty, &rdev->flags) && 1835 if (!test_bit(Faulty, &rdev->flags) &&
1823 !test_bit(In_sync, &rdev->flags)) { 1836 !test_bit(In_sync, &rdev->flags)) {
1824 len += sprintf(page+len, "%sspare", sep); 1837 len += sprintf(page+len, "%sspare", sep);
@@ -1835,6 +1848,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1835 * remove - disconnects the device 1848 * remove - disconnects the device
1836 * writemostly - sets write_mostly 1849 * writemostly - sets write_mostly
1837 * -writemostly - clears write_mostly 1850 * -writemostly - clears write_mostly
1851 * blocked - sets the Blocked flag
1852 * -blocked - clears the Blocked flag
1838 */ 1853 */
1839 int err = -EINVAL; 1854 int err = -EINVAL;
1840 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 1855 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -1857,6 +1872,16 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
1857 } else if (cmd_match(buf, "-writemostly")) { 1872 } else if (cmd_match(buf, "-writemostly")) {
1858 clear_bit(WriteMostly, &rdev->flags); 1873 clear_bit(WriteMostly, &rdev->flags);
1859 err = 0; 1874 err = 0;
1875 } else if (cmd_match(buf, "blocked")) {
1876 set_bit(Blocked, &rdev->flags);
1877 err = 0;
1878 } else if (cmd_match(buf, "-blocked")) {
1879 clear_bit(Blocked, &rdev->flags);
1880 wake_up(&rdev->blocked_wait);
1881 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
1882 md_wakeup_thread(rdev->mddev->thread);
1883
1884 err = 0;
1860 } 1885 }
1861 return err ? err : len; 1886 return err ? err : len;
1862} 1887}
@@ -2096,7 +2121,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2096 rv = -EBUSY; 2121 rv = -EBUSY;
2097 else 2122 else
2098 rv = entry->store(rdev, page, length); 2123 rv = entry->store(rdev, page, length);
2099 mddev_unlock(rdev->mddev); 2124 mddev_unlock(mddev);
2100 } 2125 }
2101 return rv; 2126 return rv;
2102} 2127}
@@ -2185,7 +2210,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2185 goto abort_free; 2210 goto abort_free;
2186 } 2211 }
2187 } 2212 }
2213
2188 INIT_LIST_HEAD(&rdev->same_set); 2214 INIT_LIST_HEAD(&rdev->same_set);
2215 init_waitqueue_head(&rdev->blocked_wait);
2189 2216
2190 return rdev; 2217 return rdev;
2191 2218
@@ -2456,7 +2483,6 @@ resync_start_show(mddev_t *mddev, char *page)
2456static ssize_t 2483static ssize_t
2457resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2484resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2458{ 2485{
2459 /* can only set chunk_size if array is not yet active */
2460 char *e; 2486 char *e;
2461 unsigned long long n = simple_strtoull(buf, &e, 10); 2487 unsigned long long n = simple_strtoull(buf, &e, 10);
2462 2488
@@ -2590,15 +2616,20 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2590 err = do_md_stop(mddev, 1); 2616 err = do_md_stop(mddev, 1);
2591 else { 2617 else {
2592 mddev->ro = 1; 2618 mddev->ro = 1;
2619 set_disk_ro(mddev->gendisk, 1);
2593 err = do_md_run(mddev); 2620 err = do_md_run(mddev);
2594 } 2621 }
2595 break; 2622 break;
2596 case read_auto: 2623 case read_auto:
2597 /* stopping an active array */
2598 if (mddev->pers) { 2624 if (mddev->pers) {
2599 err = do_md_stop(mddev, 1); 2625 if (mddev->ro != 1)
2600 if (err == 0) 2626 err = do_md_stop(mddev, 1);
2601 mddev->ro = 2; /* FIXME mark devices writable */ 2627 else
2628 err = restart_array(mddev);
2629 if (err == 0) {
2630 mddev->ro = 2;
2631 set_disk_ro(mddev->gendisk, 0);
2632 }
2602 } else { 2633 } else {
2603 mddev->ro = 2; 2634 mddev->ro = 2;
2604 err = do_md_run(mddev); 2635 err = do_md_run(mddev);
@@ -2611,6 +2642,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2611 if (atomic_read(&mddev->writes_pending) == 0) { 2642 if (atomic_read(&mddev->writes_pending) == 0) {
2612 if (mddev->in_sync == 0) { 2643 if (mddev->in_sync == 0) {
2613 mddev->in_sync = 1; 2644 mddev->in_sync = 1;
2645 if (mddev->safemode == 1)
2646 mddev->safemode = 0;
2614 if (mddev->persistent) 2647 if (mddev->persistent)
2615 set_bit(MD_CHANGE_CLEAN, 2648 set_bit(MD_CHANGE_CLEAN,
2616 &mddev->flags); 2649 &mddev->flags);
@@ -2634,6 +2667,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
2634 err = 0; 2667 err = 0;
2635 } else { 2668 } else {
2636 mddev->ro = 0; 2669 mddev->ro = 0;
2670 set_disk_ro(mddev->gendisk, 0);
2637 err = do_md_run(mddev); 2671 err = do_md_run(mddev);
2638 } 2672 }
2639 break; 2673 break;
@@ -3711,6 +3745,30 @@ static int do_md_stop(mddev_t * mddev, int mode)
3711 mddev->reshape_position = MaxSector; 3745 mddev->reshape_position = MaxSector;
3712 mddev->external = 0; 3746 mddev->external = 0;
3713 mddev->persistent = 0; 3747 mddev->persistent = 0;
3748 mddev->level = LEVEL_NONE;
3749 mddev->clevel[0] = 0;
3750 mddev->flags = 0;
3751 mddev->ro = 0;
3752 mddev->metadata_type[0] = 0;
3753 mddev->chunk_size = 0;
3754 mddev->ctime = mddev->utime = 0;
3755 mddev->layout = 0;
3756 mddev->max_disks = 0;
3757 mddev->events = 0;
3758 mddev->delta_disks = 0;
3759 mddev->new_level = LEVEL_NONE;
3760 mddev->new_layout = 0;
3761 mddev->new_chunk = 0;
3762 mddev->curr_resync = 0;
3763 mddev->resync_mismatches = 0;
3764 mddev->suspend_lo = mddev->suspend_hi = 0;
3765 mddev->sync_speed_min = mddev->sync_speed_max = 0;
3766 mddev->recovery = 0;
3767 mddev->in_sync = 0;
3768 mddev->changed = 0;
3769 mddev->degraded = 0;
3770 mddev->barriers_work = 0;
3771 mddev->safemode = 0;
3714 3772
3715 } else if (mddev->pers) 3773 } else if (mddev->pers)
3716 printk(KERN_INFO "md: %s switched to read-only mode.\n", 3774 printk(KERN_INFO "md: %s switched to read-only mode.\n",
@@ -4918,6 +4976,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
4918 4976
4919 if (!rdev || test_bit(Faulty, &rdev->flags)) 4977 if (!rdev || test_bit(Faulty, &rdev->flags))
4920 return; 4978 return;
4979
4980 if (mddev->external)
4981 set_bit(Blocked, &rdev->flags);
4921/* 4982/*
4922 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 4983 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
4923 mdname(mddev), 4984 mdname(mddev),
@@ -5364,6 +5425,8 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
5364 md_wakeup_thread(mddev->sync_thread); 5425 md_wakeup_thread(mddev->sync_thread);
5365 } 5426 }
5366 atomic_inc(&mddev->writes_pending); 5427 atomic_inc(&mddev->writes_pending);
5428 if (mddev->safemode == 1)
5429 mddev->safemode = 0;
5367 if (mddev->in_sync) { 5430 if (mddev->in_sync) {
5368 spin_lock_irq(&mddev->write_lock); 5431 spin_lock_irq(&mddev->write_lock);
5369 if (mddev->in_sync) { 5432 if (mddev->in_sync) {
@@ -5718,7 +5781,7 @@ static int remove_and_add_spares(mddev_t *mddev)
5718 5781
5719 rdev_for_each(rdev, rtmp, mddev) 5782 rdev_for_each(rdev, rtmp, mddev)
5720 if (rdev->raid_disk >= 0 && 5783 if (rdev->raid_disk >= 0 &&
5721 !mddev->external && 5784 !test_bit(Blocked, &rdev->flags) &&
5722 (test_bit(Faulty, &rdev->flags) || 5785 (test_bit(Faulty, &rdev->flags) ||
5723 ! test_bit(In_sync, &rdev->flags)) && 5786 ! test_bit(In_sync, &rdev->flags)) &&
5724 atomic_read(&rdev->nr_pending)==0) { 5787 atomic_read(&rdev->nr_pending)==0) {
@@ -5788,7 +5851,7 @@ void md_check_recovery(mddev_t *mddev)
5788 return; 5851 return;
5789 5852
5790 if (signal_pending(current)) { 5853 if (signal_pending(current)) {
5791 if (mddev->pers->sync_request) { 5854 if (mddev->pers->sync_request && !mddev->external) {
5792 printk(KERN_INFO "md: %s in immediate safe mode\n", 5855 printk(KERN_INFO "md: %s in immediate safe mode\n",
5793 mdname(mddev)); 5856 mdname(mddev));
5794 mddev->safemode = 2; 5857 mddev->safemode = 2;
@@ -5800,7 +5863,7 @@ void md_check_recovery(mddev_t *mddev)
5800 (mddev->flags && !mddev->external) || 5863 (mddev->flags && !mddev->external) ||
5801 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 5864 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
5802 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 5865 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
5803 (mddev->safemode == 1) || 5866 (mddev->external == 0 && mddev->safemode == 1) ||
5804 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 5867 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
5805 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 5868 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
5806 )) 5869 ))
@@ -5809,16 +5872,20 @@ void md_check_recovery(mddev_t *mddev)
5809 if (mddev_trylock(mddev)) { 5872 if (mddev_trylock(mddev)) {
5810 int spares = 0; 5873 int spares = 0;
5811 5874
5812 spin_lock_irq(&mddev->write_lock); 5875 if (!mddev->external) {
5813 if (mddev->safemode && !atomic_read(&mddev->writes_pending) && 5876 spin_lock_irq(&mddev->write_lock);
5814 !mddev->in_sync && mddev->recovery_cp == MaxSector) { 5877 if (mddev->safemode &&
5815 mddev->in_sync = 1; 5878 !atomic_read(&mddev->writes_pending) &&
5816 if (mddev->persistent) 5879 !mddev->in_sync &&
5817 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 5880 mddev->recovery_cp == MaxSector) {
5881 mddev->in_sync = 1;
5882 if (mddev->persistent)
5883 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
5884 }
5885 if (mddev->safemode == 1)
5886 mddev->safemode = 0;
5887 spin_unlock_irq(&mddev->write_lock);
5818 } 5888 }
5819 if (mddev->safemode == 1)
5820 mddev->safemode = 0;
5821 spin_unlock_irq(&mddev->write_lock);
5822 5889
5823 if (mddev->flags) 5890 if (mddev->flags)
5824 md_update_sb(mddev, 0); 5891 md_update_sb(mddev, 0);
@@ -5913,6 +5980,16 @@ void md_check_recovery(mddev_t *mddev)
5913 } 5980 }
5914} 5981}
5915 5982
5983void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
5984{
5985 sysfs_notify(&rdev->kobj, NULL, "state");
5986 wait_event_timeout(rdev->blocked_wait,
5987 !test_bit(Blocked, &rdev->flags),
5988 msecs_to_jiffies(5000));
5989 rdev_dec_pending(rdev, mddev);
5990}
5991EXPORT_SYMBOL(md_wait_for_blocked_rdev);
5992
5916static int md_notify_reboot(struct notifier_block *this, 5993static int md_notify_reboot(struct notifier_block *this,
5917 unsigned long code, void *x) 5994 unsigned long code, void *x)
5918{ 5995{
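md_wait_for_blocked_rdev() pokes a sysfs notification so userspace can clear the Blocked flag, waits up to five seconds for that to happen, and then drops the nr_pending reference the caller took. The caller-side shape, condensed from the raid1 hunk further down (locking and the surrounding device loop omitted):

/* inside the write path, after spotting a blocked device: */
if (rdev && test_bit(Blocked, &rdev->flags)) {
        atomic_inc(&rdev->nr_pending);  /* released inside
                                         * md_wait_for_blocked_rdev() */
        md_wait_for_blocked_rdev(rdev, mddev);
        goto retry_write;               /* rescan the disks afterwards */
}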
@@ -5947,13 +6024,9 @@ static struct notifier_block md_notifier = {
5947 6024
5948static void md_geninit(void) 6025static void md_geninit(void)
5949{ 6026{
5950 struct proc_dir_entry *p;
5951
5952 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 6027 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
5953 6028
5954 p = create_proc_entry("mdstat", S_IRUGO, NULL); 6029 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
5955 if (p)
5956 p->proc_fops = &md_seq_fops;
5957} 6030}
5958 6031
5959static int __init md_init(void) 6032static int __init md_init(void)
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3f299d835a2b..42ee1a2dc144 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -244,7 +244,8 @@ static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
244 conf->working_disks--; 244 conf->working_disks--;
245 mddev->degraded++; 245 mddev->degraded++;
246 printk(KERN_ALERT "multipath: IO failure on %s," 246 printk(KERN_ALERT "multipath: IO failure on %s,"
247 " disabling IO path. \n Operation continuing" 247 " disabling IO path.\n"
248 "multipath: Operation continuing"
248 " on %d IO paths.\n", 249 " on %d IO paths.\n",
249 bdevname (rdev->bdev,b), 250 bdevname (rdev->bdev,b),
250 conf->working_disks); 251 conf->working_disks);
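
The multipath message fix relies on compile-time concatenation of adjacent string literals: the two quoted fragments form a single format string, and placing the "multipath: " prefix after the embedded newline gives the second log line its own tag. A self-contained userspace demonstration of the same mechanism:

    #include <stdio.h>

    int main(void)
    {
            /* adjacent literals concatenate into one format string, so
             * each output line can carry its own subsystem prefix */
            printf("multipath: IO failure on %s, disabling IO path.\n"
                   "multipath: Operation continuing on %d IO paths.\n",
                   "sdb", 3);
            return 0;
    }
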
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ff61b309129a..6778b7cb39bd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -773,7 +773,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
773 r1bio_t *r1_bio; 773 r1bio_t *r1_bio;
774 struct bio *read_bio; 774 struct bio *read_bio;
775 int i, targets = 0, disks; 775 int i, targets = 0, disks;
776 mdk_rdev_t *rdev;
777 struct bitmap *bitmap = mddev->bitmap; 776 struct bitmap *bitmap = mddev->bitmap;
778 unsigned long flags; 777 unsigned long flags;
779 struct bio_list bl; 778 struct bio_list bl;
@@ -781,6 +780,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
781 const int rw = bio_data_dir(bio); 780 const int rw = bio_data_dir(bio);
782 const int do_sync = bio_sync(bio); 781 const int do_sync = bio_sync(bio);
783 int do_barriers; 782 int do_barriers;
783 mdk_rdev_t *blocked_rdev;
784 784
785 /* 785 /*
786 * Register the new request and wait if the reconstruction 786 * Register the new request and wait if the reconstruction
@@ -862,10 +862,17 @@ static int make_request(struct request_queue *q, struct bio * bio)
862 first = 0; 862 first = 0;
863 } 863 }
864#endif 864#endif
865 retry_write:
866 blocked_rdev = NULL;
865 rcu_read_lock(); 867 rcu_read_lock();
866 for (i = 0; i < disks; i++) { 868 for (i = 0; i < disks; i++) {
867 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && 869 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
868 !test_bit(Faulty, &rdev->flags)) { 870 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
871 atomic_inc(&rdev->nr_pending);
872 blocked_rdev = rdev;
873 break;
874 }
875 if (rdev && !test_bit(Faulty, &rdev->flags)) {
869 atomic_inc(&rdev->nr_pending); 876 atomic_inc(&rdev->nr_pending);
870 if (test_bit(Faulty, &rdev->flags)) { 877 if (test_bit(Faulty, &rdev->flags)) {
871 rdev_dec_pending(rdev, mddev); 878 rdev_dec_pending(rdev, mddev);
@@ -878,6 +885,20 @@ static int make_request(struct request_queue *q, struct bio * bio)
878 } 885 }
879 rcu_read_unlock(); 886 rcu_read_unlock();
880 887
888 if (unlikely(blocked_rdev)) {
889 /* Wait for this device to become unblocked */
890 int j;
891
892 for (j = 0; j < i; j++)
893 if (r1_bio->bios[j])
894 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
895
896 allow_barrier(conf);
897 md_wait_for_blocked_rdev(blocked_rdev, mddev);
898 wait_barrier(conf);
899 goto retry_write;
900 }
901
881 BUG_ON(targets == 0); /* we never fail the last device */ 902 BUG_ON(targets == 0); /* we never fail the last device */
882 903
883 if (targets < conf->raid_disks) { 904 if (targets < conf->raid_disks) {
@@ -1008,8 +1029,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1008 } else 1029 } else
1009 set_bit(Faulty, &rdev->flags); 1030 set_bit(Faulty, &rdev->flags);
1010 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1031 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1011 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" 1032 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n"
1012 " Operation continuing on %d devices\n", 1033 "raid1: Operation continuing on %d devices.\n",
1013 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1034 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1014} 1035}
1015 1036
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 32389d2f18fc..5938fa962922 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -790,6 +790,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
790 const int do_sync = bio_sync(bio); 790 const int do_sync = bio_sync(bio);
791 struct bio_list bl; 791 struct bio_list bl;
792 unsigned long flags; 792 unsigned long flags;
793 mdk_rdev_t *blocked_rdev;
793 794
794 if (unlikely(bio_barrier(bio))) { 795 if (unlikely(bio_barrier(bio))) {
795 bio_endio(bio, -EOPNOTSUPP); 796 bio_endio(bio, -EOPNOTSUPP);
@@ -879,17 +880,23 @@ static int make_request(struct request_queue *q, struct bio * bio)
879 /* 880 /*
880 * WRITE: 881 * WRITE:
881 */ 882 */
882 /* first select target devices under spinlock and 883 /* first select target devices under rcu_lock and
883 * inc refcount on their rdev. Record them by setting 884 * inc refcount on their rdev. Record them by setting
884 * bios[x] to bio 885 * bios[x] to bio
885 */ 886 */
886 raid10_find_phys(conf, r10_bio); 887 raid10_find_phys(conf, r10_bio);
888 retry_write:
 889 blocked_rdev = NULL;
887 rcu_read_lock(); 890 rcu_read_lock();
888 for (i = 0; i < conf->copies; i++) { 891 for (i = 0; i < conf->copies; i++) {
889 int d = r10_bio->devs[i].devnum; 892 int d = r10_bio->devs[i].devnum;
890 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev); 893 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
891 if (rdev && 894 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
892 !test_bit(Faulty, &rdev->flags)) { 895 atomic_inc(&rdev->nr_pending);
896 blocked_rdev = rdev;
897 break;
898 }
899 if (rdev && !test_bit(Faulty, &rdev->flags)) {
893 atomic_inc(&rdev->nr_pending); 900 atomic_inc(&rdev->nr_pending);
894 r10_bio->devs[i].bio = bio; 901 r10_bio->devs[i].bio = bio;
895 } else { 902 } else {
@@ -899,6 +906,22 @@ static int make_request(struct request_queue *q, struct bio * bio)
899 } 906 }
900 rcu_read_unlock(); 907 rcu_read_unlock();
901 908
909 if (unlikely(blocked_rdev)) {
910 /* Have to wait for this device to get unblocked, then retry */
911 int j;
912 int d;
913
914 for (j = 0; j < i; j++)
915 if (r10_bio->devs[j].bio) {
916 d = r10_bio->devs[j].devnum;
917 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
918 }
919 allow_barrier(conf);
920 md_wait_for_blocked_rdev(blocked_rdev, mddev);
921 wait_barrier(conf);
922 goto retry_write;
923 }
924
902 atomic_set(&r10_bio->remaining, 0); 925 atomic_set(&r10_bio->remaining, 0);
903 926
904 bio_list_init(&bl); 927 bio_list_init(&bl);
@@ -1001,8 +1024,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1001 } 1024 }
1002 set_bit(Faulty, &rdev->flags); 1025 set_bit(Faulty, &rdev->flags);
1003 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1026 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1004 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device. \n" 1027 printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
1005 " Operation continuing on %d devices\n", 1028 "raid10: Operation continuing on %d devices.\n",
1006 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1029 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1007} 1030}
1008 1031
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b162b839a662..087eee0cb809 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,6 +63,7 @@
63#define STRIPE_SHIFT (PAGE_SHIFT - 9) 63#define STRIPE_SHIFT (PAGE_SHIFT - 9)
64#define STRIPE_SECTORS (STRIPE_SIZE>>9) 64#define STRIPE_SECTORS (STRIPE_SIZE>>9)
65#define IO_THRESHOLD 1 65#define IO_THRESHOLD 1
66#define BYPASS_THRESHOLD 1
66#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 67#define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
67#define HASH_MASK (NR_HASH - 1) 68#define HASH_MASK (NR_HASH - 1)
68 69
@@ -398,6 +399,7 @@ static void ops_run_io(struct stripe_head *sh)
398 399
399 might_sleep(); 400 might_sleep();
400 401
402 set_bit(STRIPE_IO_STARTED, &sh->state);
401 for (i = disks; i--; ) { 403 for (i = disks; i--; ) {
402 int rw; 404 int rw;
403 struct bio *bi; 405 struct bio *bi;
@@ -433,7 +435,7 @@ static void ops_run_io(struct stripe_head *sh)
433 435
434 bi->bi_bdev = rdev->bdev; 436 bi->bi_bdev = rdev->bdev;
435 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 437 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
436 __FUNCTION__, (unsigned long long)sh->sector, 438 __func__, (unsigned long long)sh->sector,
437 bi->bi_rw, i); 439 bi->bi_rw, i);
438 atomic_inc(&sh->count); 440 atomic_inc(&sh->count);
439 bi->bi_sector = sh->sector + rdev->data_offset; 441 bi->bi_sector = sh->sector + rdev->data_offset;
@@ -520,7 +522,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
520 raid5_conf_t *conf = sh->raid_conf; 522 raid5_conf_t *conf = sh->raid_conf;
521 int i; 523 int i;
522 524
523 pr_debug("%s: stripe %llu\n", __FUNCTION__, 525 pr_debug("%s: stripe %llu\n", __func__,
524 (unsigned long long)sh->sector); 526 (unsigned long long)sh->sector);
525 527
526 /* clear completed biofills */ 528 /* clear completed biofills */
@@ -569,7 +571,7 @@ static void ops_run_biofill(struct stripe_head *sh)
569 raid5_conf_t *conf = sh->raid_conf; 571 raid5_conf_t *conf = sh->raid_conf;
570 int i; 572 int i;
571 573
572 pr_debug("%s: stripe %llu\n", __FUNCTION__, 574 pr_debug("%s: stripe %llu\n", __func__,
573 (unsigned long long)sh->sector); 575 (unsigned long long)sh->sector);
574 576
575 for (i = sh->disks; i--; ) { 577 for (i = sh->disks; i--; ) {
@@ -600,7 +602,7 @@ static void ops_complete_compute5(void *stripe_head_ref)
600 int target = sh->ops.target; 602 int target = sh->ops.target;
601 struct r5dev *tgt = &sh->dev[target]; 603 struct r5dev *tgt = &sh->dev[target];
602 604
603 pr_debug("%s: stripe %llu\n", __FUNCTION__, 605 pr_debug("%s: stripe %llu\n", __func__,
604 (unsigned long long)sh->sector); 606 (unsigned long long)sh->sector);
605 607
606 set_bit(R5_UPTODATE, &tgt->flags); 608 set_bit(R5_UPTODATE, &tgt->flags);
@@ -625,7 +627,7 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending)
625 int i; 627 int i;
626 628
627 pr_debug("%s: stripe %llu block: %d\n", 629 pr_debug("%s: stripe %llu block: %d\n",
628 __FUNCTION__, (unsigned long long)sh->sector, target); 630 __func__, (unsigned long long)sh->sector, target);
629 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 631 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
630 632
631 for (i = disks; i--; ) 633 for (i = disks; i--; )
@@ -653,7 +655,7 @@ static void ops_complete_prexor(void *stripe_head_ref)
653{ 655{
654 struct stripe_head *sh = stripe_head_ref; 656 struct stripe_head *sh = stripe_head_ref;
655 657
656 pr_debug("%s: stripe %llu\n", __FUNCTION__, 658 pr_debug("%s: stripe %llu\n", __func__,
657 (unsigned long long)sh->sector); 659 (unsigned long long)sh->sector);
658 660
659 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); 661 set_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
@@ -670,7 +672,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
670 /* existing parity data subtracted */ 672 /* existing parity data subtracted */
671 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 673 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
672 674
673 pr_debug("%s: stripe %llu\n", __FUNCTION__, 675 pr_debug("%s: stripe %llu\n", __func__,
674 (unsigned long long)sh->sector); 676 (unsigned long long)sh->sector);
675 677
676 for (i = disks; i--; ) { 678 for (i = disks; i--; ) {
@@ -699,7 +701,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
699 */ 701 */
700 int prexor = test_bit(STRIPE_OP_PREXOR, &pending); 702 int prexor = test_bit(STRIPE_OP_PREXOR, &pending);
701 703
702 pr_debug("%s: stripe %llu\n", __FUNCTION__, 704 pr_debug("%s: stripe %llu\n", __func__,
703 (unsigned long long)sh->sector); 705 (unsigned long long)sh->sector);
704 706
705 for (i = disks; i--; ) { 707 for (i = disks; i--; ) {
@@ -744,7 +746,7 @@ static void ops_complete_postxor(void *stripe_head_ref)
744{ 746{
745 struct stripe_head *sh = stripe_head_ref; 747 struct stripe_head *sh = stripe_head_ref;
746 748
747 pr_debug("%s: stripe %llu\n", __FUNCTION__, 749 pr_debug("%s: stripe %llu\n", __func__,
748 (unsigned long long)sh->sector); 750 (unsigned long long)sh->sector);
749 751
750 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); 752 set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
@@ -757,7 +759,7 @@ static void ops_complete_write(void *stripe_head_ref)
757 struct stripe_head *sh = stripe_head_ref; 759 struct stripe_head *sh = stripe_head_ref;
758 int disks = sh->disks, i, pd_idx = sh->pd_idx; 760 int disks = sh->disks, i, pd_idx = sh->pd_idx;
759 761
760 pr_debug("%s: stripe %llu\n", __FUNCTION__, 762 pr_debug("%s: stripe %llu\n", __func__,
761 (unsigned long long)sh->sector); 763 (unsigned long long)sh->sector);
762 764
763 for (i = disks; i--; ) { 765 for (i = disks; i--; ) {
@@ -787,7 +789,7 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx,
787 unsigned long flags; 789 unsigned long flags;
788 dma_async_tx_callback callback; 790 dma_async_tx_callback callback;
789 791
790 pr_debug("%s: stripe %llu\n", __FUNCTION__, 792 pr_debug("%s: stripe %llu\n", __func__,
791 (unsigned long long)sh->sector); 793 (unsigned long long)sh->sector);
792 794
793 /* check if prexor is active which means only process blocks 795 /* check if prexor is active which means only process blocks
@@ -837,7 +839,7 @@ static void ops_complete_check(void *stripe_head_ref)
837 struct stripe_head *sh = stripe_head_ref; 839 struct stripe_head *sh = stripe_head_ref;
838 int pd_idx = sh->pd_idx; 840 int pd_idx = sh->pd_idx;
839 841
840 pr_debug("%s: stripe %llu\n", __FUNCTION__, 842 pr_debug("%s: stripe %llu\n", __func__,
841 (unsigned long long)sh->sector); 843 (unsigned long long)sh->sector);
842 844
843 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && 845 if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) &&
@@ -859,7 +861,7 @@ static void ops_run_check(struct stripe_head *sh)
859 int count = 0, pd_idx = sh->pd_idx, i; 861 int count = 0, pd_idx = sh->pd_idx, i;
860 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 862 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
861 863
862 pr_debug("%s: stripe %llu\n", __FUNCTION__, 864 pr_debug("%s: stripe %llu\n", __func__,
863 (unsigned long long)sh->sector); 865 (unsigned long long)sh->sector);
864 866
865 for (i = disks; i--; ) { 867 for (i = disks; i--; ) {
@@ -1260,8 +1262,8 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1260 } 1262 }
1261 set_bit(Faulty, &rdev->flags); 1263 set_bit(Faulty, &rdev->flags);
1262 printk (KERN_ALERT 1264 printk (KERN_ALERT
1263 "raid5: Disk failure on %s, disabling device." 1265 "raid5: Disk failure on %s, disabling device.\n"
1264 " Operation continuing on %d devices\n", 1266 "raid5: Operation continuing on %d devices.\n",
1265 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1267 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1266 } 1268 }
1267} 1269}
@@ -1720,6 +1722,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1720 locked++; 1722 locked++;
1721 } 1723 }
1722 } 1724 }
1725 if (locked + 1 == disks)
1726 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
1727 atomic_inc(&sh->raid_conf->pending_full_writes);
1723 } else { 1728 } else {
1724 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 1729 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
1725 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 1730 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
@@ -1759,7 +1764,7 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
1759 locked++; 1764 locked++;
1760 1765
1761 pr_debug("%s: stripe %llu locked: %d pending: %lx\n", 1766 pr_debug("%s: stripe %llu locked: %d pending: %lx\n",
1762 __FUNCTION__, (unsigned long long)sh->sector, 1767 __func__, (unsigned long long)sh->sector,
1763 locked, sh->ops.pending); 1768 locked, sh->ops.pending);
1764 1769
1765 return locked; 1770 return locked;
@@ -1947,6 +1952,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
1947 STRIPE_SECTORS, 0, 0); 1952 STRIPE_SECTORS, 0, 0);
1948 } 1953 }
1949 1954
1955 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
1956 if (atomic_dec_and_test(&conf->pending_full_writes))
1957 md_wakeup_thread(conf->mddev->thread);
1950} 1958}
1951 1959
1952/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks 1960/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
@@ -2149,6 +2157,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
2149 0); 2157 0);
2150 } 2158 }
2151 } 2159 }
2160
2161 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2162 if (atomic_dec_and_test(&conf->pending_full_writes))
2163 md_wakeup_thread(conf->mddev->thread);
2152} 2164}
2153 2165
2154static void handle_issuing_new_write_requests5(raid5_conf_t *conf, 2166static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
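
The pending_full_writes counter stays balanced because both directions are gated on a per-stripe flag: test_and_set_bit() guarantees at most one increment per stripe, and of the several completion and failure paths that attempt the decrement, only the one that actually clears the bit performs it, with the final decrement waking raid5d. The pairing, pulled out of context (names as in the patch):

    /* acquire: runs at most once per stripe that becomes a full write */
    if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
            atomic_inc(&conf->pending_full_writes);

    /* release: callable from any teardown path; only the caller that
     * clears the bit decrements, and the last one kicks the raid5 thread */
    if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
            if (atomic_dec_and_test(&conf->pending_full_writes))
                    md_wakeup_thread(conf->mddev->thread);
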
@@ -2333,6 +2345,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
2333 s->locked++; 2345 s->locked++;
2334 set_bit(R5_Wantwrite, &sh->dev[i].flags); 2346 set_bit(R5_Wantwrite, &sh->dev[i].flags);
2335 } 2347 }
2348 if (s->locked == disks)
2349 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2350 atomic_inc(&conf->pending_full_writes);
2336 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */ 2351 /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
2337 set_bit(STRIPE_INSYNC, &sh->state); 2352 set_bit(STRIPE_INSYNC, &sh->state);
2338 2353
@@ -2592,6 +2607,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2592 } 2607 }
2593} 2608}
2594 2609
2610
2595/* 2611/*
2596 * handle_stripe - do things to a stripe. 2612 * handle_stripe - do things to a stripe.
2597 * 2613 *
@@ -2617,6 +2633,7 @@ static void handle_stripe5(struct stripe_head *sh)
2617 struct stripe_head_state s; 2633 struct stripe_head_state s;
2618 struct r5dev *dev; 2634 struct r5dev *dev;
2619 unsigned long pending = 0; 2635 unsigned long pending = 0;
2636 mdk_rdev_t *blocked_rdev = NULL;
2620 2637
2621 memset(&s, 0, sizeof(s)); 2638 memset(&s, 0, sizeof(s));
2622 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " 2639 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2676,6 +2693,11 @@ static void handle_stripe5(struct stripe_head *sh)
2676 if (dev->written) 2693 if (dev->written)
2677 s.written++; 2694 s.written++;
2678 rdev = rcu_dereference(conf->disks[i].rdev); 2695 rdev = rcu_dereference(conf->disks[i].rdev);
2696 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
2697 blocked_rdev = rdev;
2698 atomic_inc(&rdev->nr_pending);
2699 break;
2700 }
2679 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 2701 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2680 /* The ReadError flag will just be confusing now */ 2702 /* The ReadError flag will just be confusing now */
2681 clear_bit(R5_ReadError, &dev->flags); 2703 clear_bit(R5_ReadError, &dev->flags);
@@ -2690,6 +2712,11 @@ static void handle_stripe5(struct stripe_head *sh)
2690 } 2712 }
2691 rcu_read_unlock(); 2713 rcu_read_unlock();
2692 2714
2715 if (unlikely(blocked_rdev)) {
2716 set_bit(STRIPE_HANDLE, &sh->state);
2717 goto unlock;
2718 }
2719
2693 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) 2720 if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
2694 sh->ops.count++; 2721 sh->ops.count++;
2695 2722
@@ -2879,8 +2906,13 @@ static void handle_stripe5(struct stripe_head *sh)
2879 if (sh->ops.count) 2906 if (sh->ops.count)
2880 pending = get_stripe_work(sh); 2907 pending = get_stripe_work(sh);
2881 2908
2909 unlock:
2882 spin_unlock(&sh->lock); 2910 spin_unlock(&sh->lock);
2883 2911
2912 /* wait for this device to become unblocked */
2913 if (unlikely(blocked_rdev))
2914 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
2915
2884 if (pending) 2916 if (pending)
2885 raid5_run_ops(sh, pending); 2917 raid5_run_ops(sh, pending);
2886 2918
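
The unlock label added to handle_stripe5() enforces an ordering constraint: md_wait_for_blocked_rdev() can sleep for seconds, so the wait must run strictly after sh->lock is dropped, and the stripe is flagged STRIPE_HANDLE so it is re-examined once the device unblocks. The control flow, reduced to a skeleton sketch:

    spin_lock(&sh->lock);
    /* ... per-device scan may set blocked_rdev and STRIPE_HANDLE ... */
    unlock:
    spin_unlock(&sh->lock);

    /* only now is it legal to sleep */
    if (unlikely(blocked_rdev))
            md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);

handle_stripe6() below gets the identical treatment.
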
@@ -2897,6 +2929,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2897 struct stripe_head_state s; 2929 struct stripe_head_state s;
2898 struct r6_state r6s; 2930 struct r6_state r6s;
2899 struct r5dev *dev, *pdev, *qdev; 2931 struct r5dev *dev, *pdev, *qdev;
2932 mdk_rdev_t *blocked_rdev = NULL;
2900 2933
2901 r6s.qd_idx = raid6_next_disk(pd_idx, disks); 2934 r6s.qd_idx = raid6_next_disk(pd_idx, disks);
2902 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 2935 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
@@ -2960,6 +2993,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2960 if (dev->written) 2993 if (dev->written)
2961 s.written++; 2994 s.written++;
2962 rdev = rcu_dereference(conf->disks[i].rdev); 2995 rdev = rcu_dereference(conf->disks[i].rdev);
2996 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
2997 blocked_rdev = rdev;
2998 atomic_inc(&rdev->nr_pending);
2999 break;
3000 }
2963 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 3001 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
2964 /* The ReadError flag will just be confusing now */ 3002 /* The ReadError flag will just be confusing now */
2965 clear_bit(R5_ReadError, &dev->flags); 3003 clear_bit(R5_ReadError, &dev->flags);
@@ -2974,6 +3012,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
2974 set_bit(R5_Insync, &dev->flags); 3012 set_bit(R5_Insync, &dev->flags);
2975 } 3013 }
2976 rcu_read_unlock(); 3014 rcu_read_unlock();
3015
3016 if (unlikely(blocked_rdev)) {
3017 set_bit(STRIPE_HANDLE, &sh->state);
3018 goto unlock;
3019 }
2977 pr_debug("locked=%d uptodate=%d to_read=%d" 3020 pr_debug("locked=%d uptodate=%d to_read=%d"
2978 " to_write=%d failed=%d failed_num=%d,%d\n", 3021 " to_write=%d failed=%d failed_num=%d,%d\n",
2979 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3022 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3079,8 +3122,13 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3079 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) 3122 !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
3080 handle_stripe_expansion(conf, sh, &r6s); 3123 handle_stripe_expansion(conf, sh, &r6s);
3081 3124
3125 unlock:
3082 spin_unlock(&sh->lock); 3126 spin_unlock(&sh->lock);
3083 3127
3128 /* wait for this device to become unblocked */
3129 if (unlikely(blocked_rdev))
3130 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3131
3084 return_io(return_bi); 3132 return_io(return_bi);
3085 3133
3086 for (i=disks; i-- ;) { 3134 for (i=disks; i-- ;) {
@@ -3094,6 +3142,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
3094 else 3142 else
3095 continue; 3143 continue;
3096 3144
3145 set_bit(STRIPE_IO_STARTED, &sh->state);
3146
3097 bi = &sh->dev[i].req; 3147 bi = &sh->dev[i].req;
3098 3148
3099 bi->bi_rw = rw; 3149 bi->bi_rw = rw;
@@ -3164,7 +3214,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
3164 clear_bit(STRIPE_DELAYED, &sh->state); 3214 clear_bit(STRIPE_DELAYED, &sh->state);
3165 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3215 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3166 atomic_inc(&conf->preread_active_stripes); 3216 atomic_inc(&conf->preread_active_stripes);
3167 list_add_tail(&sh->lru, &conf->handle_list); 3217 list_add_tail(&sh->lru, &conf->hold_list);
3168 } 3218 }
3169 } else 3219 } else
3170 blk_plug_device(conf->mddev->queue); 3220 blk_plug_device(conf->mddev->queue);
@@ -3442,6 +3492,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3442 } 3492 }
3443} 3493}
3444 3494
3495/* __get_priority_stripe - get the next stripe to process
3496 *
3497 * Full stripe writes are allowed to pass preread active stripes up until
3498 * the bypass_threshold is exceeded. In general the bypass_count
3499 * increments when the handle_list is handled before the hold_list; however, it
3500 * will not be incremented when STRIPE_IO_STARTED is sampled set, which signifies a
3501 * stripe with in-flight i/o. The bypass_count will be reset when the
3502 * head of the hold_list has changed, i.e. the head was promoted to the
3503 * handle_list.
3504 */
3505static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3506{
3507 struct stripe_head *sh;
3508
3509 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3510 __func__,
3511 list_empty(&conf->handle_list) ? "empty" : "busy",
3512 list_empty(&conf->hold_list) ? "empty" : "busy",
3513 atomic_read(&conf->pending_full_writes), conf->bypass_count);
3514
3515 if (!list_empty(&conf->handle_list)) {
3516 sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3517
3518 if (list_empty(&conf->hold_list))
3519 conf->bypass_count = 0;
3520 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
3521 if (conf->hold_list.next == conf->last_hold)
3522 conf->bypass_count++;
3523 else {
3524 conf->last_hold = conf->hold_list.next;
3525 conf->bypass_count -= conf->bypass_threshold;
3526 if (conf->bypass_count < 0)
3527 conf->bypass_count = 0;
3528 }
3529 }
3530 } else if (!list_empty(&conf->hold_list) &&
3531 ((conf->bypass_threshold &&
3532 conf->bypass_count > conf->bypass_threshold) ||
3533 atomic_read(&conf->pending_full_writes) == 0)) {
3534 sh = list_entry(conf->hold_list.next,
3535 typeof(*sh), lru);
3536 conf->bypass_count -= conf->bypass_threshold;
3537 if (conf->bypass_count < 0)
3538 conf->bypass_count = 0;
3539 } else
3540 return NULL;
3541
3542 list_del_init(&sh->lru);
3543 atomic_inc(&sh->count);
3544 BUG_ON(atomic_read(&sh->count) != 1);
3545 return sh;
3546}
3445 3547
3446static int make_request(struct request_queue *q, struct bio * bi) 3548static int make_request(struct request_queue *q, struct bio * bi)
3447{ 3549{
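
__get_priority_stripe() implements the policy its comment describes: handle_list is always preferred while it has work, bypass_count measures how many times the same hold_list head was passed over while idle, and a held stripe is promoted once handle_list drains if either the bypass budget is spent or no full-stripe writes are pending at all. A runnable toy model of just the counter arithmetic (hypothetical userspace code; it omits the STRIPE_IO_STARTED sampling and the pending_full_writes fast path):

    #include <stdio.h>
    #include <stdbool.h>

    static int bypass_count, bypass_threshold = 1;

    /* returns true when the hold_list head should be serviced */
    static bool service_hold(bool handle_busy, bool hold_head_changed)
    {
            if (handle_busy) {
                    if (!hold_head_changed)
                            bypass_count++;    /* same head bypassed again */
                    else {
                            bypass_count -= bypass_threshold;
                            if (bypass_count < 0)
                                    bypass_count = 0;
                    }
                    return false;              /* handle_list wins */
            }
            if (bypass_count > bypass_threshold) {
                    bypass_count -= bypass_threshold;
                    if (bypass_count < 0)
                            bypass_count = 0;
                    return true;               /* budget spent: promote */
            }
            return false;
    }

    int main(void)
    {
            printf("%d\n", service_hold(true, false));   /* 0, count -> 1 */
            printf("%d\n", service_hold(true, false));   /* 0, count -> 2 */
            printf("%d\n", service_hold(false, false));  /* 1, promoted */
            return 0;
    }
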
@@ -3914,7 +4016,6 @@ static void raid5d(mddev_t *mddev)
3914 handled = 0; 4016 handled = 0;
3915 spin_lock_irq(&conf->device_lock); 4017 spin_lock_irq(&conf->device_lock);
3916 while (1) { 4018 while (1) {
3917 struct list_head *first;
3918 struct bio *bio; 4019 struct bio *bio;
3919 4020
3920 if (conf->seq_flush != conf->seq_write) { 4021 if (conf->seq_flush != conf->seq_write) {
@@ -3936,17 +4037,12 @@ static void raid5d(mddev_t *mddev)
3936 handled++; 4037 handled++;
3937 } 4038 }
3938 4039
3939 if (list_empty(&conf->handle_list)) { 4040 sh = __get_priority_stripe(conf);
4041
4042 if (!sh) {
3940 async_tx_issue_pending_all(); 4043 async_tx_issue_pending_all();
3941 break; 4044 break;
3942 } 4045 }
3943
3944 first = conf->handle_list.next;
3945 sh = list_entry(first, struct stripe_head, lru);
3946
3947 list_del_init(first);
3948 atomic_inc(&sh->count);
3949 BUG_ON(atomic_read(&sh->count)!= 1);
3950 spin_unlock_irq(&conf->device_lock); 4046 spin_unlock_irq(&conf->device_lock);
3951 4047
3952 handled++; 4048 handled++;
@@ -3978,15 +4074,13 @@ static ssize_t
3978raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4074raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
3979{ 4075{
3980 raid5_conf_t *conf = mddev_to_conf(mddev); 4076 raid5_conf_t *conf = mddev_to_conf(mddev);
3981 char *end; 4077 unsigned long new;
3982 int new;
3983 if (len >= PAGE_SIZE) 4078 if (len >= PAGE_SIZE)
3984 return -EINVAL; 4079 return -EINVAL;
3985 if (!conf) 4080 if (!conf)
3986 return -ENODEV; 4081 return -ENODEV;
3987 4082
3988 new = simple_strtoul(page, &end, 10); 4083 if (strict_strtoul(page, 10, &new))
3989 if (!*page || (*end && *end != '\n') )
3990 return -EINVAL; 4084 return -EINVAL;
3991 if (new <= 16 || new > 32768) 4085 if (new <= 16 || new > 32768)
3992 return -EINVAL; 4086 return -EINVAL;
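
strict_strtoul() centralizes the check that the deleted lines open-coded: the whole buffer must be a well-formed number, with at most a trailing newline, or the store is rejected. A userspace approximation of those semantics (hypothetical helper; the kernel function takes the same three arguments and returns 0 or a negative errno):

    #include <errno.h>
    #include <stdlib.h>

    static int my_strict_strtoul(const char *s, unsigned int base,
                                 unsigned long *res)
    {
            char *end;

            errno = 0;
            *res = strtoul(s, &end, base);
            if (errno || end == s)
                    return -EINVAL;      /* overflow or no digits */
            if (*end == '\n')
                    end++;               /* one trailing newline is ok */
            if (*end)
                    return -EINVAL;      /* trailing garbage */
            return 0;
    }
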
@@ -4011,6 +4105,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4011 raid5_store_stripe_cache_size); 4105 raid5_store_stripe_cache_size);
4012 4106
4013static ssize_t 4107static ssize_t
4108raid5_show_preread_threshold(mddev_t *mddev, char *page)
4109{
4110 raid5_conf_t *conf = mddev_to_conf(mddev);
4111 if (conf)
4112 return sprintf(page, "%d\n", conf->bypass_threshold);
4113 else
4114 return 0;
4115}
4116
4117static ssize_t
4118raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
4119{
4120 raid5_conf_t *conf = mddev_to_conf(mddev);
4121 unsigned long new;
4122 if (len >= PAGE_SIZE)
4123 return -EINVAL;
4124 if (!conf)
4125 return -ENODEV;
4126
4127 if (strict_strtoul(page, 10, &new))
4128 return -EINVAL;
4129 if (new > conf->max_nr_stripes)
4130 return -EINVAL;
4131 conf->bypass_threshold = new;
4132 return len;
4133}
4134
4135static struct md_sysfs_entry
4136raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4137 S_IRUGO | S_IWUSR,
4138 raid5_show_preread_threshold,
4139 raid5_store_preread_threshold);
4140
4141static ssize_t
4014stripe_cache_active_show(mddev_t *mddev, char *page) 4142stripe_cache_active_show(mddev_t *mddev, char *page)
4015{ 4143{
4016 raid5_conf_t *conf = mddev_to_conf(mddev); 4144 raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4026,6 +4154,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4026static struct attribute *raid5_attrs[] = { 4154static struct attribute *raid5_attrs[] = {
4027 &raid5_stripecache_size.attr, 4155 &raid5_stripecache_size.attr,
4028 &raid5_stripecache_active.attr, 4156 &raid5_stripecache_active.attr,
4157 &raid5_preread_bypass_threshold.attr,
4029 NULL, 4158 NULL,
4030}; 4159};
4031static struct attribute_group raid5_attrs_group = { 4160static struct attribute_group raid5_attrs_group = {
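
With the entry added to raid5_attrs, the knob appears as a per-array sysfs file, writable by root and bounded above by max_nr_stripes (the stripe cache size). A hedged userspace sketch, assuming an array named md0:

    #include <stdio.h>

    int main(void)
    {
            /* raise the budget so full-stripe writes may bypass held
             * preread-active stripes for longer; the value must not
             * exceed stripe_cache_size */
            FILE *f = fopen("/sys/block/md0/md/preread_bypass_threshold", "w");

            if (!f) {
                    perror("preread_bypass_threshold");
                    return 1;
            }
            fprintf(f, "8\n");
            return fclose(f) == 0 ? 0 : 1;
    }
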
@@ -4130,12 +4259,14 @@ static int run(mddev_t *mddev)
4130 init_waitqueue_head(&conf->wait_for_stripe); 4259 init_waitqueue_head(&conf->wait_for_stripe);
4131 init_waitqueue_head(&conf->wait_for_overlap); 4260 init_waitqueue_head(&conf->wait_for_overlap);
4132 INIT_LIST_HEAD(&conf->handle_list); 4261 INIT_LIST_HEAD(&conf->handle_list);
4262 INIT_LIST_HEAD(&conf->hold_list);
4133 INIT_LIST_HEAD(&conf->delayed_list); 4263 INIT_LIST_HEAD(&conf->delayed_list);
4134 INIT_LIST_HEAD(&conf->bitmap_list); 4264 INIT_LIST_HEAD(&conf->bitmap_list);
4135 INIT_LIST_HEAD(&conf->inactive_list); 4265 INIT_LIST_HEAD(&conf->inactive_list);
4136 atomic_set(&conf->active_stripes, 0); 4266 atomic_set(&conf->active_stripes, 0);
4137 atomic_set(&conf->preread_active_stripes, 0); 4267 atomic_set(&conf->preread_active_stripes, 0);
4138 atomic_set(&conf->active_aligned_reads, 0); 4268 atomic_set(&conf->active_aligned_reads, 0);
4269 conf->bypass_threshold = BYPASS_THRESHOLD;
4139 4270
4140 pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4271 pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4141 4272
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
index 77a6e4bf503d..21987e3dbe6c 100644
--- a/drivers/md/raid6algos.c
+++ b/drivers/md/raid6algos.c
@@ -121,7 +121,8 @@ int __init raid6_select_algo(void)
121 j0 = jiffies; 121 j0 = jiffies;
122 while ( (j1 = jiffies) == j0 ) 122 while ( (j1 = jiffies) == j0 )
123 cpu_relax(); 123 cpu_relax();
124 while ( (jiffies-j1) < (1 << RAID6_TIME_JIFFIES_LG2) ) { 124 while (time_before(jiffies,
125 j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
125 (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs); 126 (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs);
126 perf++; 127 perf++;
127 } 128 }
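
The raid6 benchmarking loop now uses the canonical jiffies idiom: time_before() compares free-running tick counters through a signed difference, which stays correct across a counter wrap. Roughly what the macro expands to, with a self-contained demonstration:

    #include <stdio.h>

    typedef unsigned long jiffies_t;

    /* approximately time_before(a, b), minus the kernel's type checks */
    static int my_time_before(jiffies_t a, jiffies_t b)
    {
            return (long)(a - b) < 0;
    }

    int main(void)
    {
            jiffies_t j1 = (jiffies_t)-2;   /* two ticks before the wrap */

            /* three ticks later the counter has wrapped to 1, yet the
             * ordering is still reported correctly: */
            printf("%d\n", my_time_before(j1 + 3, j1));  /* 0: not before */
            printf("%d\n", my_time_before(j1, j1 + 3));  /* 1: before */
            return 0;
    }
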