path: root/drivers/lightnvm/pblk-gc.c
author	Javier González <jg@lightnvm.io>	2017-04-15 14:55:50 -0400
committer	Jens Axboe <axboe@fb.com>	2017-04-16 12:06:33 -0400
commit	a4bd217b432685d6a177c28a2af187f041c473b7 (patch)
tree	3670d0322655bdef412c415e04c8515e865c1e37 /drivers/lightnvm/pblk-gc.c
parent	6eb082452df1218e9c0ce1168c456f839ce5acb2 (diff)
lightnvm: physical block device (pblk) target
This patch introduces pblk, a host-side translation layer for Open-Channel SSDs to expose them like block devices. The translation layer allows data placement decisions and I/O scheduling to be managed by the host, enabling users to optimize the SSD for their specific workloads.

An open-channel SSD has a set of LUNs (parallel units) and a collection of blocks. Each block can be read in any order, but writes must be sequential. Writes may also fail, and if a block requires it, it must also be reset before new writes can be applied.

To manage the constraints, pblk maintains a logical to physical address (L2P) table, a write cache, garbage collection logic, a recovery scheme, and logic to rate-limit user I/Os versus garbage collection I/Os.

The L2P table is fully-associative and manages sectors at a 4KB granularity. Pblk stores the L2P table in two places, in the out-of-band area of the media and on the last page of a line. In the case of a power failure, pblk will perform a scan to recover the L2P table.

The user data is organized into lines. A line is data striped across blocks and LUNs. The lines enable the host to reduce the amount of metadata to maintain besides the user data and make it easier to implement RAID or erasure coding in the future.

pblk implements multi-tenant support and can be instantiated multiple times on the same drive. Each instance owns a portion of the SSD - both regarding I/O bandwidth and capacity - providing I/O isolation in each case.

Finally, pblk also exposes a sysfs interface that allows user-space to peek into the internals of pblk. The interface is available at /dev/block/*/pblk/ where * is the block device name exposed.

This work also contains contributions from:
  Matias Bjørling <matias@cnexlabs.com>
  Simon A. F. Lund <slund@cnexlabs.com>
  Young Tack Jin <youngtack.jin@gmail.com>
  Huaicheng Li <huaicheng@cs.uchicago.edu>

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
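As a rough, self-contained illustration of the fully-associative L2P table at 4KB sector granularity described above (this is not pblk code; the names l2p_map, l2p_init, l2p_set, l2p_get and ADDR_EMPTY are made up for this sketch):

/*
 * Illustrative sketch only -- not pblk's actual data structures. A
 * fully-associative L2P table maps every 4KB logical sector to an
 * arbitrary physical address.
 */
#include <stdint.h>
#include <stdlib.h>

#define ADDR_EMPTY ((uint64_t)-1)	/* logical sector not yet written */

struct l2p_map {
	uint64_t *tbl;		/* one physical address per 4KB logical sector */
	uint64_t nr_secs;	/* number of exposed 4KB logical sectors */
};

static int l2p_init(struct l2p_map *m, uint64_t nr_secs)
{
	uint64_t i;

	m->tbl = malloc(nr_secs * sizeof(*m->tbl));
	if (!m->tbl)
		return -1;
	for (i = 0; i < nr_secs; i++)
		m->tbl[i] = ADDR_EMPTY;
	m->nr_secs = nr_secs;
	return 0;
}

/* Fully associative: any logical sector may map to any physical sector */
static void l2p_set(struct l2p_map *m, uint64_t lba, uint64_t ppa)
{
	m->tbl[lba] = ppa;
}

static uint64_t l2p_get(struct l2p_map *m, uint64_t lba)
{
	return m->tbl[lba];
}

Consistent with the description above, a write would pick the next physical sector in the currently open line and record it with l2p_set(), reads would consult l2p_get(), and garbage collection would rewrite still-valid sectors and update the table the same way.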
Diffstat (limited to 'drivers/lightnvm/pblk-gc.c')
-rw-r--r--	drivers/lightnvm/pblk-gc.c	555
1 file changed, 555 insertions, 0 deletions
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
new file mode 100644
index 000000000000..9b147cfd8a41
--- /dev/null
+++ b/drivers/lightnvm/pblk-gc.c
@@ -0,0 +1,555 @@
1/*
2 * Copyright (C) 2016 CNEX Labs
3 * Initial release: Javier Gonzalez <javier@cnexlabs.com>
4 * Matias Bjorling <matias@cnexlabs.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License version
8 * 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * pblk-gc.c - pblk's garbage collector
16 */
17
18#include "pblk.h"
19#include <linux/delay.h>
20
21static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
22{
23 kfree(gc_rq->data);
24 kfree(gc_rq->lba_list);
25 kfree(gc_rq);
26}
27
28static int pblk_gc_write(struct pblk *pblk)
29{
30 struct pblk_gc *gc = &pblk->gc;
31 struct pblk_gc_rq *gc_rq, *tgc_rq;
32 LIST_HEAD(w_list);
33
34 spin_lock(&gc->w_lock);
35 if (list_empty(&gc->w_list)) {
36 spin_unlock(&gc->w_lock);
37 return 1;
38 }
39
40 list_for_each_entry_safe(gc_rq, tgc_rq, &gc->w_list, list) {
41 list_move_tail(&gc_rq->list, &w_list);
42 gc->w_entries--;
43 }
44 spin_unlock(&gc->w_lock);
45
46 list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) {
47 pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list,
48 gc_rq->nr_secs, gc_rq->secs_to_gc,
49 gc_rq->line, PBLK_IOTYPE_GC);
50
51 kref_put(&gc_rq->line->ref, pblk_line_put);
52
53 list_del(&gc_rq->list);
54 pblk_gc_free_gc_rq(gc_rq);
55 }
56
57 return 0;
58}
59
60static void pblk_gc_writer_kick(struct pblk_gc *gc)
61{
62 wake_up_process(gc->gc_writer_ts);
63}
64
65/*
66 * Responsible for managing all memory related to a gc request; on failure (or
67 * when there is nothing to move) it also frees the data buffer and lba_list.
68 */
69static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line,
70 u64 *lba_list, unsigned int nr_secs)
71{
72 struct nvm_tgt_dev *dev = pblk->dev;
73 struct nvm_geo *geo = &dev->geo;
74 struct pblk_gc *gc = &pblk->gc;
75 struct pblk_gc_rq *gc_rq;
76 void *data;
77 unsigned int secs_to_gc;
78 int ret = NVM_IO_OK;
79
80 data = kmalloc(nr_secs * geo->sec_size, GFP_KERNEL);
81 if (!data) {
82 ret = NVM_IO_ERR;
83 goto free_lba_list;
84 }
85
86 /* Read from GC victim block */
87 if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs,
88 &secs_to_gc, line)) {
89 ret = NVM_IO_ERR;
90 goto free_data;
91 }
92
93 if (!secs_to_gc)
94 goto free_data;
95
96 gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
97 if (!gc_rq) {
98 ret = NVM_IO_ERR;
99 goto free_data;
100 }
101
102 gc_rq->line = line;
103 gc_rq->data = data;
104 gc_rq->lba_list = lba_list;
105 gc_rq->nr_secs = nr_secs;
106 gc_rq->secs_to_gc = secs_to_gc;
107
108 kref_get(&line->ref);
109
110retry:
111 spin_lock(&gc->w_lock);
112 if (gc->w_entries > 256) {
113 spin_unlock(&gc->w_lock);
114 usleep_range(256, 1024);
115 goto retry;
116 }
117 gc->w_entries++;
118 list_add_tail(&gc_rq->list, &gc->w_list);
119 spin_unlock(&gc->w_lock);
120
121 pblk_gc_writer_kick(&pblk->gc);
122
123 return NVM_IO_OK;
124
125free_data:
126 kfree(data);
127free_lba_list:
128 kfree(lba_list);
129
130 return ret;
131}
132
133static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
134{
135 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
136 struct list_head *move_list;
137
138 spin_lock(&line->lock);
139 WARN_ON(line->state != PBLK_LINESTATE_GC);
140 line->state = PBLK_LINESTATE_CLOSED;
141 move_list = pblk_line_gc_list(pblk, line);
142 spin_unlock(&line->lock);
143
144 if (move_list) {
145 spin_lock(&l_mg->gc_lock);
146 list_add_tail(&line->list, move_list);
147 spin_unlock(&l_mg->gc_lock);
148 }
149}
150
151static void pblk_gc_line_ws(struct work_struct *work)
152{
153 struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
154 ws);
155 struct pblk *pblk = line_ws->pblk;
156 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
157 struct pblk_line *line = line_ws->line;
158 struct pblk_line_meta *lm = &pblk->lm;
159 __le64 *lba_list = line_ws->priv;
160 u64 *gc_list;
161 int sec_left;
162 int nr_ppas, bit;
163 int put_line = 1;
164
165 pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
166
167 spin_lock(&line->lock);
168 sec_left = line->vsc;
169 if (!sec_left) {
170 /* Lines are erased before being used (l_mg->data_/log_next) */
171 spin_unlock(&line->lock);
172 goto out;
173 }
174 spin_unlock(&line->lock);
175
176 if (sec_left < 0) {
177 pr_err("pblk: corrupted GC line (%d)\n", line->id);
178 put_line = 0;
179 pblk_put_line_back(pblk, line);
180 goto out;
181 }
182
183 bit = -1;
184next_rq:
185 gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL);
186 if (!gc_list) {
187 put_line = 0;
188 pblk_put_line_back(pblk, line);
189 goto out;
190 }
191
192 nr_ppas = 0;
193 do {
194 bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line,
195 bit + 1);
196 if (bit > line->emeta_ssec)
197 break;
198
199 gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]);
200 } while (nr_ppas < pblk->max_write_pgs);
201
202 if (unlikely(!nr_ppas)) {
203 kfree(gc_list);
204 goto out;
205 }
206
207 if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) {
208 pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n",
209 line->id, line->vsc,
210 nr_ppas, nr_ppas);
211 put_line = 0;
212 pblk_put_line_back(pblk, line);
213 goto out;
214 }
215
216 sec_left -= nr_ppas;
217 if (sec_left > 0)
218 goto next_rq;
219
220out:
221 pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
222 mempool_free(line_ws, pblk->line_ws_pool);
223 atomic_dec(&pblk->gc.inflight_gc);
224 if (put_line)
225 kref_put(&line->ref, pblk_line_put);
226}
227
228static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
229{
230 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
231 struct pblk_line_meta *lm = &pblk->lm;
232 struct pblk_line_ws *line_ws;
233 __le64 *lba_list;
234 int ret;
235
236 line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL);
237 line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type,
238 GFP_KERNEL);
239 if (!line->emeta) {
240 pr_err("pblk: cannot use GC emeta\n");
241 goto fail_free_ws;
242 }
243
244 ret = pblk_line_read_emeta(pblk, line);
245 if (ret) {
246 pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret);
247 goto fail_free_emeta;
248 }
249
250 /* If this read fails, it means that emeta is corrupted. For now, leave
251 * the line untouched. TODO: Implement a recovery routine that scans and
252 * moves all sectors on the line.
253 */
254 lba_list = pblk_recov_get_lba_list(pblk, line->emeta);
255 if (!lba_list) {
256 pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
257 goto fail_free_emeta;
258 }
259
260 line_ws->pblk = pblk;
261 line_ws->line = line;
262 line_ws->priv = lba_list;
263
264 INIT_WORK(&line_ws->ws, pblk_gc_line_ws);
265 queue_work(pblk->gc.gc_reader_wq, &line_ws->ws);
266
267 return 0;
268
269fail_free_emeta:
270 pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
271fail_free_ws:
272 mempool_free(line_ws, pblk->line_ws_pool);
273 pblk_put_line_back(pblk, line);
274
275 return 1;
276}
277
278static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list)
279{
280 struct pblk_line *line, *tline;
281
282 list_for_each_entry_safe(line, tline, gc_list, list) {
283 if (pblk_gc_line(pblk, line))
284 pr_err("pblk: failed to GC line %d\n", line->id);
285 list_del(&line->list);
286 }
287}
288
289/*
290 * Lines with no valid sectors will be returned to the free list immediately. If
291 * GC is activated - either because the free block count is under the determined
292 * threshold, or because it is being forced from user space - only lines with a
293 * high count of invalid sectors will be recycled.
294 */
295static void pblk_gc_run(struct pblk *pblk)
296{
297 struct pblk_line_mgmt *l_mg = &pblk->l_mg;
298 struct pblk_gc *gc = &pblk->gc;
299 struct pblk_line *line, *tline;
300 unsigned int nr_blocks_free, nr_blocks_need;
301 struct list_head *group_list;
302 int run_gc, gc_group = 0;
303 int prev_gc = 0;
304 int inflight_gc = atomic_read(&gc->inflight_gc);
305 LIST_HEAD(gc_list);
306
307 spin_lock(&l_mg->gc_lock);
308 list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) {
309 spin_lock(&line->lock);
310 WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
311 line->state = PBLK_LINESTATE_GC;
312 spin_unlock(&line->lock);
313
314 list_del(&line->list);
315 kref_put(&line->ref, pblk_line_put);
316 }
317 spin_unlock(&l_mg->gc_lock);
318
319 nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl);
320 nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl);
321 run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
322
323next_gc_group:
324 group_list = l_mg->gc_lists[gc_group++];
325 spin_lock(&l_mg->gc_lock);
326 while (run_gc && !list_empty(group_list)) {
327 /* No need to queue up more GC lines than we can handle */
328 if (!run_gc || inflight_gc > gc->gc_jobs_active) {
329 spin_unlock(&l_mg->gc_lock);
330 pblk_gc_lines(pblk, &gc_list);
331 return;
332 }
333
334 line = list_first_entry(group_list, struct pblk_line, list);
335 nr_blocks_free += line->blk_in_line;
336
337 spin_lock(&line->lock);
338 WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
339 line->state = PBLK_LINESTATE_GC;
340 list_move_tail(&line->list, &gc_list);
341 atomic_inc(&gc->inflight_gc);
342 inflight_gc++;
343 spin_unlock(&line->lock);
344
345 prev_gc = 1;
346 run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
347 }
348 spin_unlock(&l_mg->gc_lock);
349
350 pblk_gc_lines(pblk, &gc_list);
351
352 if (!prev_gc && pblk->rl.rb_state > gc_group &&
353 gc_group < PBLK_NR_GC_LISTS)
354 goto next_gc_group;
355}
356
357
358static void pblk_gc_kick(struct pblk *pblk)
359{
360 struct pblk_gc *gc = &pblk->gc;
361
362 wake_up_process(gc->gc_ts);
363 pblk_gc_writer_kick(gc);
364 mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
365}
366
367static void pblk_gc_timer(unsigned long data)
368{
369 struct pblk *pblk = (struct pblk *)data;
370
371 pblk_gc_kick(pblk);
372}
373
374static int pblk_gc_ts(void *data)
375{
376 struct pblk *pblk = data;
377
378 while (!kthread_should_stop()) {
379 pblk_gc_run(pblk);
380 set_current_state(TASK_INTERRUPTIBLE);
381 io_schedule();
382 }
383
384 return 0;
385}
386
387static int pblk_gc_writer_ts(void *data)
388{
389 struct pblk *pblk = data;
390
391 while (!kthread_should_stop()) {
392 if (!pblk_gc_write(pblk))
393 continue;
394 set_current_state(TASK_INTERRUPTIBLE);
395 io_schedule();
396 }
397
398 return 0;
399}
400
401static void pblk_gc_start(struct pblk *pblk)
402{
403 pblk->gc.gc_active = 1;
404
405 pr_debug("pblk: gc start\n");
406}
407
408int pblk_gc_status(struct pblk *pblk)
409{
410 struct pblk_gc *gc = &pblk->gc;
411 int ret;
412
413 spin_lock(&gc->lock);
414 ret = gc->gc_active;
415 spin_unlock(&gc->lock);
416
417 return ret;
418}
419
420static void __pblk_gc_should_start(struct pblk *pblk)
421{
422 struct pblk_gc *gc = &pblk->gc;
423
424 lockdep_assert_held(&gc->lock);
425
426 if (gc->gc_enabled && !gc->gc_active)
427 pblk_gc_start(pblk);
428}
429
430void pblk_gc_should_start(struct pblk *pblk)
431{
432 struct pblk_gc *gc = &pblk->gc;
433
434 spin_lock(&gc->lock);
435 __pblk_gc_should_start(pblk);
436 spin_unlock(&gc->lock);
437}
438
439/*
440 * If flush_wq == 1 then no lock should be held by the caller since
441 * flush_workqueue can sleep
442 */
443static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
444{
445 spin_lock(&pblk->gc.lock);
446 pblk->gc.gc_active = 0;
447 spin_unlock(&pblk->gc.lock);
448
449 pr_debug("pblk: gc stop\n");
450}
451
452void pblk_gc_should_stop(struct pblk *pblk)
453{
454 struct pblk_gc *gc = &pblk->gc;
455
456 if (gc->gc_active && !gc->gc_forced)
457 pblk_gc_stop(pblk, 0);
458}
459
460void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
461 int *gc_active)
462{
463 struct pblk_gc *gc = &pblk->gc;
464
465 spin_lock(&gc->lock);
466 *gc_enabled = gc->gc_enabled;
467 *gc_active = gc->gc_active;
468 spin_unlock(&gc->lock);
469}
470
471void pblk_gc_sysfs_force(struct pblk *pblk, int force)
472{
473 struct pblk_gc *gc = &pblk->gc;
474 int rsv = 0;
475
476 spin_lock(&gc->lock);
477 if (force) {
478 gc->gc_enabled = 1;
479 rsv = 64;
480 }
481 pblk_rl_set_gc_rsc(&pblk->rl, rsv);
482 gc->gc_forced = force;
483 __pblk_gc_should_start(pblk);
484 spin_unlock(&gc->lock);
485}
486
487int pblk_gc_init(struct pblk *pblk)
488{
489 struct pblk_gc *gc = &pblk->gc;
490 int ret;
491
492 gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
493 if (IS_ERR(gc->gc_ts)) {
494 pr_err("pblk: could not allocate GC main kthread\n");
495 return PTR_ERR(gc->gc_ts);
496 }
497
498 gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
499 "pblk-gc-writer-ts");
500 if (IS_ERR(gc->gc_writer_ts)) {
501 pr_err("pblk: could not allocate GC writer kthread\n");
502 ret = PTR_ERR(gc->gc_writer_ts);
503 goto fail_free_main_kthread;
504 }
505
506 setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk);
507 mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
508
509 gc->gc_active = 0;
510 gc->gc_forced = 0;
511 gc->gc_enabled = 1;
512 gc->gc_jobs_active = 8;
513 gc->w_entries = 0;
514 atomic_set(&gc->inflight_gc, 0);
515
516 gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq",
517 WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active);
518 if (!gc->gc_reader_wq) {
519 pr_err("pblk: could not allocate GC reader workqueue\n");
520 ret = -ENOMEM;
521 goto fail_free_writer_kthread;
522 }
523
524 spin_lock_init(&gc->lock);
525 spin_lock_init(&gc->w_lock);
526 INIT_LIST_HEAD(&gc->w_list);
527
528 return 0;
529
530fail_free_writer_kthread:
531	kthread_stop(gc->gc_writer_ts);
532fail_free_main_kthread:
533	kthread_stop(gc->gc_ts);
534
535 return ret;
536}
537
538void pblk_gc_exit(struct pblk *pblk)
539{
540 struct pblk_gc *gc = &pblk->gc;
541
542 flush_workqueue(gc->gc_reader_wq);
543
544 del_timer(&gc->gc_timer);
545 pblk_gc_stop(pblk, 1);
546
547 if (gc->gc_ts)
548 kthread_stop(gc->gc_ts);
549
550 if (pblk->gc.gc_reader_wq)
551 destroy_workqueue(pblk->gc.gc_reader_wq);
552
553 if (gc->gc_writer_ts)
554 kthread_stop(gc->gc_writer_ts);
555}