author     Javier González <jg@lightnvm.io>          2017-04-15 14:55:50 -0400
committer  Jens Axboe <axboe@fb.com>                 2017-04-16 12:06:33 -0400
commit     a4bd217b432685d6a177c28a2af187f041c473b7
tree       3670d0322655bdef412c415e04c8515e865c1e37
parent     6eb082452df1218e9c0ce1168c456f839ce5acb2
lightnvm: physical block device (pblk) target
This patch introduces pblk, a host-side translation layer for Open-Channel SSDs that exposes them as block devices. The translation layer allows data placement decisions and I/O scheduling to be managed by the host, enabling users to optimize the SSD for their specific workloads.

An open-channel SSD has a set of LUNs (parallel units) and a collection of blocks. Each block can be read in any order, but writes must be sequential. Writes may also fail, and if a block requires it, it must be reset before new writes can be applied.

To manage these constraints, pblk maintains a logical-to-physical address (L2P) table, a write cache, garbage collection logic, a recovery scheme, and logic to rate-limit user I/Os against garbage collection I/Os.

The L2P table is fully associative and manages sectors at a 4KB granularity. Pblk stores the L2P table in two places: in the out-of-band area of the media and on the last page of a line. In case of a power failure, pblk performs a scan to recover the L2P table.

User data is organized into lines. A line is data striped across blocks and LUNs. Lines reduce the amount of metadata the host must maintain besides the user data and make it easier to implement RAID or erasure coding in the future.

pblk implements multi-tenant support and can be instantiated multiple times on the same drive. Each instance owns a portion of the SSD, both in terms of I/O bandwidth and capacity, providing I/O isolation between instances.

Finally, pblk exposes a sysfs interface that allows user space to peek into its internals. The interface is available at /dev/block/*/pblk/, where * is the name of the exposed block device.

This work also contains contributions from:
  Matias Bjørling <matias@cnexlabs.com>
  Simon A. F. Lund <slund@cnexlabs.com>
  Young Tack Jin <youngtack.jin@gmail.com>
  Huaicheng Li <huaicheng@cs.uchicago.edu>

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
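As a rough illustration of the L2P scheme described above, the following standalone C sketch models a fully-associative table at 4KB sector granularity whose entries either point to a physical address on the media or into the host-side write cache. It is not part of the patch, and all identifiers in it (toy_l2p, toy_l2p_entry, toy_l2p_lookup) are hypothetical rather than pblk's actual structures.

/*
 * Illustrative sketch only -- not part of the patch. It models the idea
 * described in the commit message: a fully-associative L2P table at 4KB
 * sector granularity, where an entry either points to a physical address
 * on the media or into the host-side write cache. All names here are
 * made up for illustration and do not match pblk's real data structures.
 */
#include <stddef.h>
#include <stdint.h>

#define TOY_SECTOR_SHIFT 12                     /* 4KB logical sectors */

struct toy_l2p_entry {
        uint64_t addr;                          /* PPA on media, or cache line index */
        uint8_t  cached;                        /* 1: entry still lives in the write cache */
};

struct toy_l2p {
        struct toy_l2p_entry *map;              /* one entry per 4KB logical sector */
        uint64_t nr_secs;
};

/* Translate a byte offset to its current location (media or write cache). */
static struct toy_l2p_entry *toy_l2p_lookup(struct toy_l2p *l2p, uint64_t byte_off)
{
        uint64_t lba = byte_off >> TOY_SECTOR_SHIFT;

        if (lba >= l2p->nr_secs)
                return NULL;
        return &l2p->map[lba];
}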
Diffstat (limited to 'drivers/lightnvm/pblk-write.c')
-rw-r--r--   drivers/lightnvm/pblk-write.c | 411
1 file changed, 411 insertions(+), 0 deletions(-)
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
new file mode 100644
index 000000000000..ee57db993cd1
--- /dev/null
+++ b/drivers/lightnvm/pblk-write.c
@@ -0,0 +1,411 @@
/*
 * Copyright (C) 2016 CNEX Labs
 * Initial release: Javier Gonzalez <javier@cnexlabs.com>
 *                  Matias Bjorling <matias@cnexlabs.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version
 * 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * pblk-write.c - pblk's write path from write buffer to media
 */

#include "pblk.h"

static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line)
{
#ifdef CONFIG_NVM_DEBUG
        atomic_long_inc(&pblk->sync_writes);
#endif

        /* Counter protected by rb sync lock */
        line->left_ssecs--;
        if (!line->left_ssecs)
                pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
}

static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
                                    struct pblk_c_ctx *c_ctx)
{
        struct nvm_tgt_dev *dev = pblk->dev;
        struct bio *original_bio;
        unsigned long ret;
        int i;

        for (i = 0; i < c_ctx->nr_valid; i++) {
                struct pblk_w_ctx *w_ctx;
                struct ppa_addr p;
                struct pblk_line *line;

                w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);

                p = rqd->ppa_list[i];
                line = &pblk->lines[pblk_dev_ppa_to_line(p)];
                pblk_sync_line(pblk, line);

                while ((original_bio = bio_list_pop(&w_ctx->bios)))
                        bio_endio(original_bio);
        }

#ifdef CONFIG_NVM_DEBUG
        atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes);
#endif

        ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);

        if (rqd->meta_list)
                nvm_dev_dma_free(dev->parent, rqd->meta_list,
                                 rqd->dma_meta_list);

        bio_put(rqd->bio);
        pblk_free_rqd(pblk, rqd, WRITE);

        return ret;
}

static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
                                           struct nvm_rq *rqd,
                                           struct pblk_c_ctx *c_ctx)
{
        list_del(&c_ctx->list);
        return pblk_end_w_bio(pblk, rqd, c_ctx);
}

static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
                                struct pblk_c_ctx *c_ctx)
{
        struct pblk_c_ctx *c, *r;
        unsigned long flags;
        unsigned long pos;

#ifdef CONFIG_NVM_DEBUG
        atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
#endif

        pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);

        pos = pblk_rb_sync_init(&pblk->rwb, &flags);
        if (pos == c_ctx->sentry) {
                pos = pblk_end_w_bio(pblk, rqd, c_ctx);

retry:
                list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
                        rqd = nvm_rq_from_c_ctx(c);
                        if (c->sentry == pos) {
                                pos = pblk_end_queued_w_bio(pblk, rqd, c);
                                goto retry;
                        }
                }
        } else {
                WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
                list_add_tail(&c_ctx->list, &pblk->compl_list);
        }
        pblk_rb_sync_end(&pblk->rwb, &flags);
}

/* When a write fails, we are not sure whether the block has grown bad or a page
 * range is more susceptible to write errors. If a high number of pages fail, we
 * assume that the block is bad and we mark it accordingly. In all cases, we
 * remap and resubmit the failed entries as fast as possible; if a flush is
 * waiting on a completion, the whole stack would stall otherwise.
 */
static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
{
        void *comp_bits = &rqd->ppa_status;
        struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
        struct pblk_rec_ctx *recovery;
        struct ppa_addr *ppa_list = rqd->ppa_list;
        int nr_ppas = rqd->nr_ppas;
        unsigned int c_entries;
        int bit, ret;

        if (unlikely(nr_ppas == 1))
                ppa_list = &rqd->ppa_addr;

        recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
        if (!recovery) {
                pr_err("pblk: could not allocate recovery context\n");
                return;
        }
        INIT_LIST_HEAD(&recovery->failed);

        bit = -1;
        while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
                struct pblk_rb_entry *entry;
                struct ppa_addr ppa;

                /* Logic error */
                if (bit > c_ctx->nr_valid) {
                        WARN_ON_ONCE("pblk: corrupted write request\n");
                        goto out;
                }

                ppa = ppa_list[bit];
                entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
                if (!entry) {
                        pr_err("pblk: could not scan entry on write failure\n");
                        goto out;
                }

                /* The list is filled first and emptied afterwards. No need for
                 * protecting it with a lock
                 */
                list_add_tail(&entry->index, &recovery->failed);
        }

        c_entries = find_first_bit(comp_bits, nr_ppas);
        ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
        if (ret) {
                pr_err("pblk: could not recover from write failure\n");
                goto out;
        }

        INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
        queue_work(pblk->kw_wq, &recovery->ws_rec);

out:
        pblk_complete_write(pblk, rqd, c_ctx);
}

static void pblk_end_io_write(struct nvm_rq *rqd)
{
        struct pblk *pblk = rqd->private;
        struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);

        if (rqd->error) {
                pblk_log_write_err(pblk, rqd);
                return pblk_end_w_fail(pblk, rqd);
        }
#ifdef CONFIG_NVM_DEBUG
        else
                WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n");
#endif

        pblk_complete_write(pblk, rqd, c_ctx);
}

static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
                           unsigned int nr_secs)
{
        struct nvm_tgt_dev *dev = pblk->dev;

        /* Setup write request */
        rqd->opcode = NVM_OP_PWRITE;
        rqd->nr_ppas = nr_secs;
        rqd->flags = pblk_set_progr_mode(pblk, WRITE);
        rqd->private = pblk;
        rqd->end_io = pblk_end_io_write;

        rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
                                           &rqd->dma_meta_list);
        if (!rqd->meta_list)
                return -ENOMEM;

        if (unlikely(nr_secs == 1))
                return 0;

        rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
        rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;

        return 0;
}

static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
                           struct pblk_c_ctx *c_ctx)
{
        struct pblk_line_meta *lm = &pblk->lm;
        struct pblk_line *e_line = pblk_line_get_data_next(pblk);
        struct ppa_addr erase_ppa;
        unsigned int valid = c_ctx->nr_valid;
        unsigned int padded = c_ctx->nr_padded;
        unsigned int nr_secs = valid + padded;
        unsigned long *lun_bitmap;
        int ret = 0;

        lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
        if (!lun_bitmap) {
                ret = -ENOMEM;
                goto out;
        }
        c_ctx->lun_bitmap = lun_bitmap;

        ret = pblk_alloc_w_rq(pblk, rqd, nr_secs);
        if (ret) {
                kfree(lun_bitmap);
                goto out;
        }

        ppa_set_empty(&erase_ppa);
        if (likely(!e_line || !e_line->left_eblks))
                pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
        else
                pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
                                  valid, &erase_ppa);

out:
        if (unlikely(e_line && !ppa_empty(erase_ppa))) {
                if (pblk_blk_erase_async(pblk, erase_ppa)) {
                        struct nvm_tgt_dev *dev = pblk->dev;
                        struct nvm_geo *geo = &dev->geo;
                        int bit;

                        e_line->left_eblks++;
                        bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch;
                        WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
                        up(&pblk->erase_sem);
                }
        }

        return ret;
}

int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
                        struct pblk_c_ctx *c_ctx)
{
        struct pblk_line_meta *lm = &pblk->lm;
        unsigned long *lun_bitmap;
        int ret;

        lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
        if (!lun_bitmap)
                return -ENOMEM;

        c_ctx->lun_bitmap = lun_bitmap;

        ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas);
        if (ret)
                return ret;

        pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);

        rqd->ppa_status = (u64)0;
        rqd->flags = pblk_set_progr_mode(pblk, WRITE);

        return ret;
}

static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
                                  unsigned int secs_to_flush)
{
        int secs_to_sync;

        secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);

#ifdef CONFIG_NVM_DEBUG
        if ((!secs_to_sync && secs_to_flush)
                        || (secs_to_sync < 0)
                        || (secs_to_sync > secs_avail && !secs_to_flush)) {
                pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
                       secs_avail, secs_to_sync, secs_to_flush);
        }
#endif

        return secs_to_sync;
}

static int pblk_submit_write(struct pblk *pblk)
{
        struct bio *bio;
        struct nvm_rq *rqd;
        struct pblk_c_ctx *c_ctx;
        unsigned int pgs_read;
        unsigned int secs_avail, secs_to_sync, secs_to_com;
        unsigned int secs_to_flush;
        unsigned long pos;
        int err;

        /* If there are no sectors in the cache, flushes (bios without data)
         * will be cleared on the cache threads
         */
        secs_avail = pblk_rb_read_count(&pblk->rwb);
        if (!secs_avail)
                return 1;

        secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
        if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
                return 1;

        rqd = pblk_alloc_rqd(pblk, WRITE);
        if (IS_ERR(rqd)) {
                pr_err("pblk: cannot allocate write req.\n");
                return 1;
        }
        c_ctx = nvm_rq_to_pdu(rqd);

        bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
        if (!bio) {
                pr_err("pblk: cannot allocate write bio\n");
                goto fail_free_rqd;
        }
        bio->bi_iter.bi_sector = 0; /* internal bio */
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
        rqd->bio = bio;

        secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
        if (secs_to_sync > pblk->max_write_pgs) {
                pr_err("pblk: bad buffer sync calculation\n");
                goto fail_put_bio;
        }

        secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
        pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);

        pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos,
                                       secs_to_sync, secs_avail);
        if (!pgs_read) {
                pr_err("pblk: corrupted write bio\n");
                goto fail_put_bio;
        }

        if (c_ctx->nr_padded)
                if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded))
                        goto fail_put_bio;

        /* Assign lbas to ppas and populate request structure */
        err = pblk_setup_w_rq(pblk, rqd, c_ctx);
        if (err) {
                pr_err("pblk: could not setup write request\n");
                goto fail_free_bio;
        }

        err = pblk_submit_io(pblk, rqd);
        if (err) {
                pr_err("pblk: I/O submission failed: %d\n", err);
                goto fail_free_bio;
        }

#ifdef CONFIG_NVM_DEBUG
        atomic_long_add(secs_to_sync, &pblk->sub_writes);
#endif

        return 0;

fail_free_bio:
        if (c_ctx->nr_padded)
                pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded);
fail_put_bio:
        bio_put(bio);
fail_free_rqd:
        pblk_free_rqd(pblk, rqd, WRITE);

        return 1;
}

int pblk_write_ts(void *data)
{
        struct pblk *pblk = data;

        while (!kthread_should_stop()) {
                if (!pblk_submit_write(pblk))
                        continue;
                set_current_state(TASK_INTERRUPTIBLE);
                io_schedule();
        }

        return 0;
}
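For context on how a writer loop like pblk_write_ts() is typically driven, the sketch below shows the usual kernel-thread pattern: create the thread when the target is initialized, wake it whenever new sectors land in the write buffer, and stop it on teardown. This is not taken from the patch; struct toy_tgt and its functions are hypothetical, and only the kthread API calls (kthread_create, wake_up_process, kthread_stop) are standard kernel interfaces.

/*
 * Hedged usage sketch, not part of the patch: the common pattern for
 * running a writer loop such as pblk_write_ts() as a kernel thread.
 * "toy_tgt" and its fields are made up for illustration.
 */
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

struct toy_tgt {
        struct task_struct *writer_ts;  /* thread running the write loop */
};

int pblk_write_ts(void *data);          /* the loop defined above */

static int toy_tgt_start_writer(struct toy_tgt *tgt, void *tgt_private)
{
        tgt->writer_ts = kthread_create(pblk_write_ts, tgt_private,
                                        "toy-writer-t");
        if (IS_ERR(tgt->writer_ts))
                return PTR_ERR(tgt->writer_ts);

        wake_up_process(tgt->writer_ts);
        return 0;
}

/* Called whenever new sectors are queued in the write buffer. */
static void toy_tgt_kick_writer(struct toy_tgt *tgt)
{
        wake_up_process(tgt->writer_ts);
}

static void toy_tgt_stop_writer(struct toy_tgt *tgt)
{
        if (tgt->writer_ts)
                kthread_stop(tgt->writer_ts);
}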