Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/bcache.h        2
-rw-r--r--  drivers/md/bcache/closure.h       2
-rw-r--r--  drivers/md/bitmap.c               6
-rw-r--r--  drivers/md/dm-bufio.c             8
-rw-r--r--  drivers/md/dm-cache-target.c      3
-rw-r--r--  drivers/md/dm-crypt.c            61
-rw-r--r--  drivers/md/dm-mpath.c            12
-rw-r--r--  drivers/md/dm-snap.c              4
-rw-r--r--  drivers/md/dm-thin.c            106
-rw-r--r--  drivers/md/dm-verity.c           15
-rw-r--r--  drivers/md/dm.c                   3
-rw-r--r--  drivers/md/md.c                  20
-rw-r--r--  drivers/md/raid10.c              13
-rw-r--r--  drivers/md/raid5.c              163
-rw-r--r--  drivers/md/raid5.h                4
15 files changed, 292 insertions, 130 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 82c9c5d35251..d2ebcf323094 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -828,7 +828,7 @@ static inline bool cached_dev_get(struct cached_dev *dc)
 		return false;
 
 	/* Paired with the mb in cached_dev_attach */
-	smp_mb__after_atomic_inc();
+	smp_mb__after_atomic();
 	return true;
 }
 
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 7ef7461912be..a08e3eeac3c5 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -243,7 +243,7 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
 	cl->fn = fn;
 	cl->wq = wq;
 	/* between atomic_dec() in closure_put() */
-	smp_mb__before_atomic_dec();
+	smp_mb__before_atomic();
 }
 
 static inline void closure_queue(struct closure *cl)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9a8e66ae04f5..67f8b31e2054 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -669,17 +669,13 @@ static inline unsigned long file_page_offset(struct bitmap_storage *store,
 /*
  * return a pointer to the page in the filemap that contains the given bit
  *
- * this lookup is complicated by the fact that the bitmap sb might be exactly
- * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
- * 0 or page 1
  */
 static inline struct page *filemap_get_page(struct bitmap_storage *store,
 					    unsigned long chunk)
 {
 	if (file_page_index(store, chunk) >= store->file_pages)
 		return NULL;
-	return store->filemap[file_page_index(store, chunk)
-			      - file_page_index(store, 0)];
+	return store->filemap[file_page_index(store, chunk)];
 }
 
 static int bitmap_storage_alloc(struct bitmap_storage *store,
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 66c5d130c8c2..4e84095833db 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -607,9 +607,9 @@ static void write_endio(struct bio *bio, int error)
 
 	BUG_ON(!test_bit(B_WRITING, &b->state));
 
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(B_WRITING, &b->state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 
 	wake_up_bit(&b->state, B_WRITING);
 }
@@ -997,9 +997,9 @@ static void read_endio(struct bio *bio, int error)
 
 	BUG_ON(!test_bit(B_READING, &b->state));
 
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(B_READING, &b->state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 
 	wake_up_bit(&b->state, B_READING);
 }
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1bf4a71919ec..5f054c44b485 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2178,6 +2178,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
 	ti->discard_zeroes_data_unsupported = true;
+	/* Discard bios must be split on a block boundary */
+	ti->split_discard_bios = true;
 
 	cache->features = ca->features;
 	ti->per_bio_data_size = get_per_bio_data_size(cache);
@@ -2488,6 +2490,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 
 	} else {
 		inc_hit_counter(cache, bio);
+		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 
 		if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
 		    !is_dirty(cache, lookup_result.cblock))
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 784695d22fde..53b213226c01 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -19,7 +19,6 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
-#include <linux/percpu.h>
 #include <linux/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
@@ -43,6 +42,7 @@ struct convert_context {
 	struct bvec_iter iter_out;
 	sector_t cc_sector;
 	atomic_t cc_pending;
+	struct ablkcipher_request *req;
 };
 
 /*
@@ -111,15 +111,7 @@ struct iv_tcw_private {
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
 
 /*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
-	struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
  */
 struct crypt_config {
 	struct dm_dev *dev;
@@ -150,12 +142,6 @@ struct crypt_config {
 	sector_t iv_offset;
 	unsigned int iv_size;
 
-	/*
-	 * Duplicated per cpu state. Access through
-	 * per_cpu_ptr() only.
-	 */
-	struct crypt_cpu __percpu *cpu;
-
 	/* ESSIV: struct crypto_cipher *essiv_tfm */
 	void *iv_private;
 	struct crypto_ablkcipher **tfms;
@@ -192,11 +178,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
-	return this_cpu_ptr(cc->cpu);
-}
-
 /*
  * Use this to access cipher attributes that are the same for each CPU.
  */
@@ -903,16 +884,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
 
-	if (!this_cc->req)
-		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+	if (!ctx->req)
+		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
-	ablkcipher_request_set_callback(this_cc->req,
+	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+	ablkcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
 }
 
 /*
@@ -921,7 +901,6 @@ static void crypt_alloc_req(struct crypt_config *cc,
 static int crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
-	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
 	atomic_set(&ctx->cc_pending, 1);
@@ -932,7 +911,7 @@ static int crypt_convert(struct crypt_config *cc,
 
 		atomic_inc(&ctx->cc_pending);
 
-		r = crypt_convert_block(cc, ctx, this_cc->req);
+		r = crypt_convert_block(cc, ctx, ctx->req);
 
 		switch (r) {
 		/* async */
@@ -941,7 +920,7 @@ static int crypt_convert(struct crypt_config *cc,
 			reinit_completion(&ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
-			this_cc->req = NULL;
+			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
 
@@ -1040,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
 	io->sector = sector;
 	io->error = 0;
 	io->base_io = NULL;
+	io->ctx.req = NULL;
 	atomic_set(&io->io_pending, 0);
 
 	return io;
@@ -1065,6 +1045,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
 	if (!atomic_dec_and_test(&io->io_pending))
 		return;
 
+	if (io->ctx.req)
+		mempool_free(io->ctx.req, cc->req_pool);
 	mempool_free(io, cc->io_pool);
 
 	if (likely(!base_io))
@@ -1492,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_config *cc)
 static void crypt_dtr(struct dm_target *ti)
 {
 	struct crypt_config *cc = ti->private;
-	struct crypt_cpu *cpu_cc;
-	int cpu;
 
 	ti->private = NULL;
 
@@ -1505,13 +1485,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->crypt_queue)
 		destroy_workqueue(cc->crypt_queue);
 
-	if (cc->cpu)
-		for_each_possible_cpu(cpu) {
-			cpu_cc = per_cpu_ptr(cc->cpu, cpu);
-			if (cpu_cc->req)
-				mempool_free(cpu_cc->req, cc->req_pool);
-		}
-
 	crypt_free_tfms(cc);
 
 	if (cc->bs)
@@ -1530,9 +1503,6 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->dev)
 		dm_put_device(ti, cc->dev);
 
-	if (cc->cpu)
-		free_percpu(cc->cpu);
-
 	kzfree(cc->cipher);
 	kzfree(cc->cipher_string);
 
@@ -1588,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
-				 __alignof__(struct crypt_cpu));
-	if (!cc->cpu) {
-		ti->error = "Cannot allocate per cpu state";
-		goto bad_mem;
-	}
-
 	/*
 	 * For compatibility with the original dm-crypt mapping format, if
 	 * only the cipher name is supplied, use cbc-plain.
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index aa009e865871..ebfa411d1a7d 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -445,11 +445,11 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
 	else
 		m->saved_queue_if_no_path = queue_if_no_path;
 	m->queue_if_no_path = queue_if_no_path;
-	if (!m->queue_if_no_path)
-		dm_table_run_md_queue_async(m->ti->table);
-
 	spin_unlock_irqrestore(&m->lock, flags);
 
+	if (!queue_if_no_path)
+		dm_table_run_md_queue_async(m->ti->table);
+
 	return 0;
 }
 
@@ -954,7 +954,7 @@ out:
  */
 static int reinstate_path(struct pgpath *pgpath)
 {
-	int r = 0;
+	int r = 0, run_queue = 0;
 	unsigned long flags;
 	struct multipath *m = pgpath->pg->m;
 
@@ -978,7 +978,7 @@ static int reinstate_path(struct pgpath *pgpath)
 
 	if (!m->nr_valid_paths++) {
 		m->current_pgpath = NULL;
-		dm_table_run_md_queue_async(m->ti->table);
+		run_queue = 1;
 	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
 		if (queue_work(kmpath_handlerd, &pgpath->activate_path.work))
 			m->pg_init_in_progress++;
@@ -991,6 +991,8 @@ static int reinstate_path(struct pgpath *pgpath)
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
+	if (run_queue)
+		dm_table_run_md_queue_async(m->ti->table);
 
 	return r;
 }
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index ebddef5237e4..8e0caed0bf74 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -642,7 +642,7 @@ static void free_pending_exception(struct dm_snap_pending_exception *pe)
 	struct dm_snapshot *s = pe->snap;
 
 	mempool_free(pe, s->pending_pool);
-	smp_mb__before_atomic_dec();
+	smp_mb__before_atomic();
 	atomic_dec(&s->pending_exceptions_count);
 }
 
@@ -783,7 +783,7 @@ static int init_hash_tables(struct dm_snapshot *s)
 static void merge_shutdown(struct dm_snapshot *s)
 {
 	clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&s->state_bits, RUNNING_MERGE);
 }
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 53728be84dee..242ac2ea5f29 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -27,6 +27,9 @@
 #define MAPPING_POOL_SIZE 1024
 #define PRISON_CELLS 1024
 #define COMMIT_PERIOD HZ
+#define NO_SPACE_TIMEOUT_SECS 60
+
+static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
 
 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 		"A percentage of time allocated for copy on write");
@@ -175,6 +178,7 @@ struct pool {
 	struct workqueue_struct *wq;
 	struct work_struct worker;
 	struct delayed_work waker;
+	struct delayed_work no_space_timeout;
 
 	unsigned long last_commit_jiffies;
 	unsigned ref_count;
@@ -232,6 +236,13 @@ struct thin_c {
 	struct bio_list deferred_bio_list;
 	struct bio_list retry_on_resume_list;
 	struct rb_root sort_bio_list; /* sorted list of deferred bios */
+
+	/*
+	 * Ensures the thin is not destroyed until the worker has finished
+	 * iterating the active_thins list.
+	 */
+	atomic_t refcount;
+	struct completion can_destroy;
 };
 
 /*----------------------------------------------------------------*/
@@ -928,7 +939,7 @@ static int commit(struct pool *pool)
 {
 	int r;
 
-	if (get_pool_mode(pool) != PM_WRITE)
+	if (get_pool_mode(pool) >= PM_READ_ONLY)
 		return -EINVAL;
 
 	r = dm_pool_commit_metadata(pool->pmd);
@@ -1486,6 +1497,45 @@ static void process_thin_deferred_bios(struct thin_c *tc)
 	blk_finish_plug(&plug);
 }
 
+static void thin_get(struct thin_c *tc);
+static void thin_put(struct thin_c *tc);
+
+/*
+ * We can't hold rcu_read_lock() around code that can block. So we
+ * find a thin with the rcu lock held; bump a refcount; then drop
+ * the lock.
+ */
+static struct thin_c *get_first_thin(struct pool *pool)
+{
+	struct thin_c *tc = NULL;
+
+	rcu_read_lock();
+	if (!list_empty(&pool->active_thins)) {
+		tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
+		thin_get(tc);
+	}
+	rcu_read_unlock();
+
+	return tc;
+}
+
+static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
+{
+	struct thin_c *old_tc = tc;
+
+	rcu_read_lock();
+	list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
+		thin_get(tc);
+		thin_put(old_tc);
+		rcu_read_unlock();
+		return tc;
+	}
+	thin_put(old_tc);
+	rcu_read_unlock();
+
+	return NULL;
+}
+
 static void process_deferred_bios(struct pool *pool)
 {
 	unsigned long flags;
@@ -1493,10 +1543,11 @@ static void process_deferred_bios(struct pool *pool)
 	struct bio_list bios;
 	struct thin_c *tc;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tc, &pool->active_thins, list)
+	tc = get_first_thin(pool);
+	while (tc) {
 		process_thin_deferred_bios(tc);
-	rcu_read_unlock();
+		tc = get_next_thin(pool, tc);
+	}
 
 	/*
 	 * If there are any deferred flush bios, we must commit
@@ -1543,6 +1594,20 @@ static void do_waker(struct work_struct *ws)
 	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
 }
 
+/*
+ * We're holding onto IO to allow userland time to react. After the
+ * timeout either the pool will have been resized (and thus back in
+ * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ */
+static void do_no_space_timeout(struct work_struct *ws)
+{
+	struct pool *pool = container_of(to_delayed_work(ws), struct pool,
+					 no_space_timeout);
+
+	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
+		set_pool_mode(pool, PM_READ_ONLY);
+}
+
 /*----------------------------------------------------------------*/
 
 struct noflush_work {
@@ -1578,7 +1643,7 @@ static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
 {
 	struct noflush_work w;
 
-	INIT_WORK(&w.worker, fn);
+	INIT_WORK_ONSTACK(&w.worker, fn);
 	w.tc = tc;
 	atomic_set(&w.complete, 0);
 	init_waitqueue_head(&w.wait);
@@ -1607,6 +1672,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 	struct pool_c *pt = pool->ti->private;
 	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
 	enum pool_mode old_mode = get_pool_mode(pool);
+	unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
 
 	/*
 	 * Never allow the pool to transition to PM_WRITE mode if user
@@ -1668,6 +1734,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_discard = process_discard;
 		pool->process_prepared_mapping = process_prepared_mapping;
 		pool->process_prepared_discard = process_prepared_discard_passdown;
+
+		if (!pool->pf.error_if_no_space && no_space_timeout)
+			queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
 		break;
 
 	case PM_WRITE:
@@ -2053,6 +2122,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 
 	INIT_WORK(&pool->worker, do_worker);
 	INIT_DELAYED_WORK(&pool->waker, do_waker);
+	INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
 	spin_lock_init(&pool->lock);
 	bio_list_init(&pool->deferred_flush_bios);
 	INIT_LIST_HEAD(&pool->prepared_mappings);
@@ -2615,6 +2685,7 @@ static void pool_postsuspend(struct dm_target *ti)
 	struct pool *pool = pt->pool;
 
 	cancel_delayed_work(&pool->waker);
+	cancel_delayed_work(&pool->no_space_timeout);
 	flush_workqueue(pool->wq);
 	(void) commit(pool);
 }
@@ -3061,11 +3132,25 @@ static struct target_type pool_target = {
 /*----------------------------------------------------------------
  * Thin target methods
  *--------------------------------------------------------------*/
+static void thin_get(struct thin_c *tc)
+{
+	atomic_inc(&tc->refcount);
+}
+
+static void thin_put(struct thin_c *tc)
+{
+	if (atomic_dec_and_test(&tc->refcount))
+		complete(&tc->can_destroy);
+}
+
 static void thin_dtr(struct dm_target *ti)
 {
 	struct thin_c *tc = ti->private;
 	unsigned long flags;
 
+	thin_put(tc);
+	wait_for_completion(&tc->can_destroy);
+
 	spin_lock_irqsave(&tc->pool->lock, flags);
 	list_del_rcu(&tc->list);
 	spin_unlock_irqrestore(&tc->pool->lock, flags);
@@ -3101,6 +3186,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	struct thin_c *tc;
 	struct dm_dev *pool_dev, *origin_dev;
 	struct mapped_device *pool_md;
+	unsigned long flags;
 
 	mutex_lock(&dm_thin_pool_table.mutex);
 
@@ -3191,9 +3277,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
 
-	spin_lock(&tc->pool->lock);
+	atomic_set(&tc->refcount, 1);
+	init_completion(&tc->can_destroy);
+
+	spin_lock_irqsave(&tc->pool->lock, flags);
 	list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
-	spin_unlock(&tc->pool->lock);
+	spin_unlock_irqrestore(&tc->pool->lock, flags);
 	/*
 	 * This synchronize_rcu() call is needed here otherwise we risk a
 	 * wake_worker() call finding no bios to process (because the newly
@@ -3422,6 +3511,9 @@ static void dm_thin_exit(void)
 module_init(dm_thin_init);
 module_exit(dm_thin_exit);
 
+module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
+
 MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 796007a5e0e1..7a7bab8947ae 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -330,15 +330,17 @@ test_block_hash:
 				return r;
 			}
 		}
-
 		todo = 1 << v->data_dev_block_bits;
-		while (io->iter.bi_size) {
+		do {
 			u8 *page;
+			unsigned len;
 			struct bio_vec bv = bio_iter_iovec(bio, io->iter);
 
 			page = kmap_atomic(bv.bv_page);
-			r = crypto_shash_update(desc, page + bv.bv_offset,
-						bv.bv_len);
+			len = bv.bv_len;
+			if (likely(len >= todo))
+				len = todo;
+			r = crypto_shash_update(desc, page + bv.bv_offset, len);
 			kunmap_atomic(page);
 
 			if (r < 0) {
@@ -346,8 +348,9 @@ test_block_hash:
 				return r;
 			}
 
-			bio_advance_iter(bio, &io->iter, bv.bv_len);
-		}
+			bio_advance_iter(bio, &io->iter, len);
+			todo -= len;
+		} while (todo);
 
 		if (!v->version) {
 			r = crypto_shash_update(desc, v->salt, v->salt_size);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 455e64916498..aa9e093343d4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq,
 	clone->cmd = rq->cmd;
 	clone->cmd_len = rq->cmd_len;
 	clone->sense = rq->sense;
-	clone->buffer = rq->buffer;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
 
@@ -2447,7 +2446,7 @@ static void dm_wq_work(struct work_struct *work)
 static void dm_queue_flush(struct mapped_device *md)
 {
 	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	queue_work(md->wq, &md->work);
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8fda38d23e38..34846856dbc6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3448,6 +3448,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->level = LEVEL_NONE;
 		return rv;
 	}
+	if (mddev->ro)
+		return -EROFS;
 
 	/* request to change the personality. Need to ensure:
 	 *  - array is not engaged in resync/recovery/reshape
@@ -3634,6 +3636,8 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
 		int err;
 		if (mddev->pers->check_reshape == NULL)
 			return -EBUSY;
+		if (mddev->ro)
+			return -EROFS;
 		mddev->new_layout = n;
 		err = mddev->pers->check_reshape(mddev);
 		if (err) {
@@ -3723,6 +3727,8 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
 		int err;
 		if (mddev->pers->check_reshape == NULL)
 			return -EBUSY;
+		if (mddev->ro)
+			return -EROFS;
 		mddev->new_chunk_sectors = n >> 9;
 		err = mddev->pers->check_reshape(mddev);
 		if (err) {
@@ -6135,6 +6141,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 	 */
 	if (mddev->sync_thread)
 		return -EBUSY;
+	if (mddev->ro)
+		return -EROFS;
 
 	rdev_for_each(rdev, mddev) {
 		sector_t avail = rdev->sectors;
@@ -6157,6 +6165,8 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
 	/* change the number of raid disks */
 	if (mddev->pers->check_reshape == NULL)
 		return -EINVAL;
+	if (mddev->ro)
+		return -EROFS;
 	if (raid_disks <= 0 ||
 	    (mddev->max_disks && raid_disks >= mddev->max_disks))
 		return -EINVAL;
@@ -7381,8 +7391,10 @@ void md_do_sync(struct md_thread *thread)
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
 		return;
-	if (mddev->ro) /* never try to sync a read-only array */
+	if (mddev->ro) {/* never try to sync a read-only array */
+		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		return;
+	}
 
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
@@ -7824,6 +7836,7 @@ void md_check_recovery(struct mddev *mddev)
 			/* There is no thread, but we need to call
 			 * ->spare_active and clear saved_raid_disk
 			 */
+			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			md_reap_sync_thread(mddev);
 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			goto unlock;
@@ -8330,7 +8343,7 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
 		if (a < s) {
 			/* we need to split this range */
 			if (bb->count >= MD_MAX_BADBLOCKS) {
-				rv = 0;
+				rv = -ENOSPC;
 				goto out;
 			}
 			memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
@@ -8516,7 +8529,8 @@ static int md_notify_reboot(struct notifier_block *this,
 		if (mddev_trylock(mddev)) {
 			if (mddev->pers)
 				__md_stop_writes(mddev);
-			mddev->safemode = 2;
+			if (mddev->persistent)
+				mddev->safemode = 2;
 			mddev_unlock(mddev);
 		}
 		need_delay = 1;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 33fc408e5eac..cb882aae9e20 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1172,6 +1172,13 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	int max_sectors;
 	int sectors;
 
+	/*
+	 * Register the new request and wait if the reconstruction
+	 * thread has put up a bar for new requests.
+	 * Continue immediately if no resync is active currently.
+	 */
+	wait_barrier(conf);
+
 	sectors = bio_sectors(bio);
 	while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    bio->bi_iter.bi_sector < conf->reshape_progress &&
@@ -1552,12 +1559,6 @@ static void make_request(struct mddev *mddev, struct bio *bio)
 
 	md_write_start(mddev, bio);
 
-	/*
-	 * Register the new request and wait if the reconstruction
-	 * thread has put up a bar for new requests.
-	 * Continue immediately if no resync is active currently.
-	 */
-	wait_barrier(conf);
 
 	do {
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 25247a852912..6234b2e84587 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -292,9 +292,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 	BUG_ON(atomic_read(&conf->active_stripes)==0);
 	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 		if (test_bit(STRIPE_DELAYED, &sh->state) &&
-		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 			list_add_tail(&sh->lru, &conf->delayed_list);
-		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+			if (atomic_read(&conf->preread_active_stripes)
+			    < IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		} else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 			   sh->bm_seq - conf->seq_write > 0)
 			list_add_tail(&sh->lru, &conf->bitmap_list);
 		else {
@@ -413,6 +416,11 @@ static void release_stripe(struct stripe_head *sh)
 	int hash;
 	bool wakeup;
 
+	/* Avoid release_list until the last reference.
+	 */
+	if (atomic_add_unless(&sh->count, -1, 1))
+		return;
+
 	if (unlikely(!conf->mddev->thread) ||
 	    test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
 		goto slow_path;
@@ -479,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh)
 	int num = sh->raid_conf->pool_size;
 
 	for (i = 0; i < num ; i++) {
+		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 		p = sh->dev[i].page;
 		if (!p)
 			continue;
@@ -499,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh)
 			return 1;
 		}
 		sh->dev[i].page = page;
+		sh->dev[i].orig_page = page;
 	}
 	return 0;
 }
@@ -855,6 +865,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
 				bi->bi_rw |= REQ_NOMERGE;
 
+			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].vec.bv_page = sh->dev[i].page;
 			bi->bi_vcnt = 1;
 			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			bi->bi_io_vec[0].bv_offset = 0;
@@ -899,6 +912,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			else
 				rbi->bi_iter.bi_sector = (sh->sector
 						  + rrdev->data_offset);
+			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].rvec.bv_page = sh->dev[i].page;
 			rbi->bi_vcnt = 1;
 			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 			rbi->bi_io_vec[0].bv_offset = 0;
@@ -927,8 +943,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 }
 
 static struct dma_async_tx_descriptor *
-async_copy_data(int frombio, struct bio *bio, struct page *page,
-	sector_t sector, struct dma_async_tx_descriptor *tx)
+async_copy_data(int frombio, struct bio *bio, struct page **page,
+	sector_t sector, struct dma_async_tx_descriptor *tx,
+	struct stripe_head *sh)
 {
 	struct bio_vec bvl;
 	struct bvec_iter iter;
@@ -965,11 +982,16 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
 		if (clen > 0) {
 			b_offset += bvl.bv_offset;
 			bio_page = bvl.bv_page;
-			if (frombio)
-				tx = async_memcpy(page, bio_page, page_offset,
+			if (frombio) {
+				if (sh->raid_conf->skip_copy &&
+				    b_offset == 0 && page_offset == 0 &&
+				    clen == STRIPE_SIZE)
+					*page = bio_page;
+				else
+					tx = async_memcpy(*page, bio_page, page_offset,
 						  b_offset, clen, &submit);
-			else
-				tx = async_memcpy(bio_page, page, b_offset,
+			} else
+				tx = async_memcpy(bio_page, *page, b_offset,
 						  page_offset, clen, &submit);
 		}
 		/* chain the operations */
@@ -1045,8 +1067,8 @@ static void ops_run_biofill(struct stripe_head *sh)
 			spin_unlock_irq(&sh->stripe_lock);
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
-				tx = async_copy_data(0, rbi, dev->page,
-					dev->sector, tx);
+				tx = async_copy_data(0, rbi, &dev->page,
+					dev->sector, tx, sh);
 				rbi = r5_next_bio(rbi, dev->sector);
 			}
 		}
@@ -1384,6 +1406,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->stripe_lock);
+			WARN_ON(dev->page != dev->orig_page);
 
 			while (wbi && wbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1393,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 					set_bit(R5_SyncIO, &dev->flags);
 				if (wbi->bi_rw & REQ_DISCARD)
 					set_bit(R5_Discard, &dev->flags);
-				else
-					tx = async_copy_data(1, wbi, dev->page,
-						dev->sector, tx);
+				else {
+					tx = async_copy_data(1, wbi, &dev->page,
+						dev->sector, tx, sh);
+					if (dev->page != dev->orig_page) {
+						set_bit(R5_SkipCopy, &dev->flags);
+						clear_bit(R5_UPTODATE, &dev->flags);
+						clear_bit(R5_OVERWRITE, &dev->flags);
+					}
+				}
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
 		}
@@ -1426,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->written || i == pd_idx || i == qd_idx) {
-			if (!discard)
+			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
 				set_bit(R5_UPTODATE, &dev->flags);
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
@@ -1839,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		osh = get_free_stripe(conf, hash);
 		unlock_device_hash_lock(conf, hash);
 		atomic_set(&nsh->count, 1);
-		for(i=0; i<conf->pool_size; i++)
+		for(i=0; i<conf->pool_size; i++) {
 			nsh->dev[i].page = osh->dev[i].page;
+			nsh->dev[i].orig_page = osh->dev[i].page;
+		}
 		for( ; i<newsize; i++)
 			nsh->dev[i].page = NULL;
 		nsh->hash_lock_index = hash;
@@ -1896,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			if (nsh->dev[i].page == NULL) {
 				struct page *p = alloc_page(GFP_NOIO);
 				nsh->dev[i].page = p;
+				nsh->dev[i].orig_page = p;
 				if (!p)
 					err = -ENOMEM;
 			}
@@ -2133,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error)
 }
 
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
 
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
 {
 	struct r5dev *dev = &sh->dev[i];
 
 	bio_init(&dev->req);
 	dev->req.bi_io_vec = &dev->vec;
-	dev->req.bi_vcnt++;
-	dev->req.bi_max_vecs++;
+	dev->req.bi_max_vecs = 1;
 	dev->req.bi_private = sh;
-	dev->vec.bv_page = dev->page;
 
 	bio_init(&dev->rreq);
 	dev->rreq.bi_io_vec = &dev->rvec;
-	dev->rreq.bi_vcnt++;
-	dev->rreq.bi_max_vecs++;
+	dev->rreq.bi_max_vecs = 1;
 	dev->rreq.bi_private = sh;
-	dev->rvec.bv_page = dev->page;
 
 	dev->flags = 0;
 	dev->sector = compute_blocknr(sh, i, previous);
@@ -2750,6 +2778,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		/* and fail all 'written' */
 		bi = sh->dev[i].written;
 		sh->dev[i].written = NULL;
+		if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
+			WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+			sh->dev[i].page = sh->dev[i].orig_page;
+		}
+
 		if (bi) bitmap_end = 1;
 		while (bi && bi->bi_iter.bi_sector <
 		       sh->dev[i].sector + STRIPE_SECTORS) {
@@ -2886,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
 	     (s->failed >= 1 && fdev[0]->toread) ||
 	     (s->failed >= 2 && fdev[1]->toread) ||
 	     (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
+	      (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
 	      !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
-	     (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
+	     (sh->raid_conf->level == 6 && s->failed && s->to_write &&
+	      s->to_write < sh->raid_conf->raid_disks - 2 &&
+	      (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
 		/* we would like to get this block, possibly by computing it,
 		 * otherwise read it if the backing disk is insync
 		 */
@@ -2991,12 +3027,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
 			    (test_bit(R5_UPTODATE, &dev->flags) ||
-			     test_bit(R5_Discard, &dev->flags))) {
+			     test_bit(R5_Discard, &dev->flags) ||
+			     test_bit(R5_SkipCopy, &dev->flags))) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				pr_debug("Return write for disc %d\n", i);
 				if (test_and_clear_bit(R5_Discard, &dev->flags))
 					clear_bit(R5_UPTODATE, &dev->flags);
+				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
+					WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
+					dev->page = dev->orig_page;
+				}
 				wbi = dev->written;
 				dev->written = NULL;
 				while (wbi && wbi->bi_iter.bi_sector <
@@ -3015,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 						0);
 			} else if (test_bit(R5_Discard, &dev->flags))
 				discard_pending = 1;
+			WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
+			WARN_ON(dev->page != dev->orig_page);
 		}
 	if (!discard_pending &&
 	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -3086,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		    test_bit(R5_Wantcompute, &dev->flags))) {
-			if (test_bit(R5_Insync, &dev->flags)) rcw++;
+			if (test_bit(R5_Insync, &dev->flags))
+				rcw++;
 			else
 				rcw += 2*disks;
 		}
@@ -3107,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			    test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
-				if (
-				    test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
-					pr_debug("Read_old block "
-						 "%d for r-m-w\n", i);
+				if (test_bit(STRIPE_PREREAD_ACTIVE,
+					     &sh->state)) {
+					pr_debug("Read_old block %d for r-m-w\n",
+						 i);
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
 					s->locked++;
@@ -3133,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
-				if (!test_bit(R5_Insync, &dev->flags))
-					continue; /* it's a failed drive */
-				if (
-				    test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+				if (test_bit(R5_Insync, &dev->flags) &&
+				    test_bit(STRIPE_PREREAD_ACTIVE,
+					     &sh->state)) {
 					pr_debug("Read_old block "
 						 "%d for Reconstruct\n", i);
 					set_bit(R5_LOCKED, &dev->flags);
@@ -4370,8 +4413,7 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 		sh->group = NULL;
 	}
 	list_del_init(&sh->lru);
-	atomic_inc(&sh->count);
-	BUG_ON(atomic_read(&sh->count) != 1);
+	BUG_ON(atomic_inc_return(&sh->count) != 1);
 	return sh;
 }
 
@@ -4401,7 +4443,7 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
 			 * STRIPE_ON_UNPLUG_LIST clear but the stripe
 			 * is still in our list
 			 */
-			smp_mb__before_clear_bit();
+			smp_mb__before_atomic();
 			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
 			/*
 			 * STRIPE_ON_RELEASE_LIST could be set here. In that
@@ -5032,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
 	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
 
 	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
+	set_bit(STRIPE_HANDLE, &sh->state);
 
-	handle_stripe(sh);
 	release_stripe(sh);
 
 	return STRIPE_SECTORS;
@@ -5073,7 +5115,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 			/* already done this stripe */
 			continue;
 
-		sh = get_active_stripe(conf, sector, 0, 1, 0);
+		sh = get_active_stripe(conf, sector, 0, 1, 1);
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
@@ -5356,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
 				raid5_store_preread_threshold);
 
 static ssize_t
+raid5_show_skip_copy(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->skip_copy);
+	else
+		return 0;
+}
+
+static ssize_t
+raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+	if (!conf)
+		return -ENODEV;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+	new = !!new;
+	if (new == conf->skip_copy)
+		return len;
+
+	mddev_suspend(mddev);
+	conf->skip_copy = new;
+	if (new)
+		mddev->queue->backing_dev_info.capabilities |=
+						BDI_CAP_STABLE_WRITES;
+	else
+		mddev->queue->backing_dev_info.capabilities &=
+						~BDI_CAP_STABLE_WRITES;
+	mddev_resume(mddev);
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
+					raid5_show_skip_copy,
+					raid5_store_skip_copy);
+
+
+static ssize_t
 stripe_cache_active_show(struct mddev *mddev, char *page)
 {
 	struct r5conf *conf = mddev->private;
@@ -5440,6 +5526,7 @@ static struct attribute *raid5_attrs[] = {
 	&raid5_stripecache_active.attr,
 	&raid5_preread_bypass_threshold.attr,
 	&raid5_group_thread_cnt.attr,
+	&raid5_skip_copy.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 01ad8ae8f578..bc72cd4be5f8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -232,7 +232,7 @@ struct stripe_head {
 	 */
 	struct bio	req, rreq;
 	struct bio_vec	vec, rvec;
-	struct page	*page;
+	struct page	*page, *orig_page;
 	struct bio	*toread, *read, *towrite, *written;
 	sector_t	sector;			/* sector of this page */
 	unsigned long	flags;
@@ -299,6 +299,7 @@ enum r5dev_flags {
 			 * data in, and now is a good time to write it out.
 			 */
 	R5_Discard,	/* Discard the stripe */
+	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
 };
 
 /*
@@ -436,6 +437,7 @@ struct r5conf {
 	atomic_t		pending_full_writes; /* full write backlog */
 	int			bypass_count; /* bypassed prereads */
 	int			bypass_threshold; /* preread nice */
+	int			skip_copy; /* Don't copy data from bio to stripe cache */
 	struct list_head	*last_hold; /* detect hold_list promotions */
 
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */