author		Xiubo Li <lixiubo@cmss.chinamobile.com>	2017-05-01 23:38:06 -0400
committer	Nicholas Bellinger <nab@linux-iscsi.org>	2017-05-02 01:58:04 -0400
commit		b6df4b79a5514a9c6c53533436704129ef45bf76 (patch)
tree		b7b6e9fa76ba07f85759c720282693c6f9e46279 /drivers/target
parent		141685a39151aea95eb56562d2953e919c6c73da (diff)
tcmu: Add global data block pool support
For each target there is one ring, so as the number of targets grows, the rings could eventually exhaust system memory. With this patch, each target ring keeps a command area fixed at 8MB, while its data area grows on demand from 0 up to a maximum of 256K * PAGE_SIZE (1GB with 4KB pages). All targets' data areas take empty blocks from a "global data block pool", which is currently limited to 512K * PAGE_SIZE (2GB with 4KB pages).

When the global data block pool is used up, any target can wake up the unmap thread routine to shrink other targets' data area memory. The unmap thread routine always tries to truncate the ring vma starting from the last in-use block offset. When user space touches data blocks outside the tcmu_cmd iov[], tcmu_page_fault() returns a zeroed block.

The timeout handler's call to tcmu_handle_completions() is moved into the unmap thread routine: when the timeout fires, it only runs tcmu_check_expired_cmd() and then wakes the unmap thread, which handles the completions and then tries to shrink its idle memory. This lets cmdr_lock become a mutex, which simplifies the patch because unmap_mapping_range() or zap_* may sleep.

Signed-off-by: Xiubo Li <lixiubo@cmss.chinamobile.com>
Signed-off-by: Jianfei Hu <hujianfei@cmss.chinamobile.com>
Acked-by: Mike Christie <mchristi@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
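To make the data-area growth policy described above concrete, here is a small standalone C sketch that mirrors the threshold heuristic in is_ring_space_avail() from this patch (the helpers grow_dbi_thresh() and max_u32() are illustrative only, not kernel code): the per-device threshold starts at 0 (idle), jumps to at least DATA_BLOCK_INIT_BITS blocks on first use, then grows by max(blocks needed, thresh / 2), capped at DATA_BLOCK_BITS.

#include <stdint.h>
#include <stdio.h>

#define DATA_BLOCK_INIT_BITS	128		/* first expansion from idle */
#define DATA_BLOCK_BITS		(256 * 1024)	/* per-device cap: 256K blocks */

static uint32_t max_u32(uint32_t a, uint32_t b)
{
	return a > b ? a : b;
}

/* Illustrative copy of the thresh-expansion heuristic in is_ring_space_avail(). */
static uint32_t grow_dbi_thresh(uint32_t dbi_thresh, uint32_t blocks_needed)
{
	if (!dbi_thresh)	/* from idle state */
		return max_u32(blocks_needed, DATA_BLOCK_INIT_BITS);

	/* grow by max(blocks needed, dbi_thresh / 2), limited to the max */
	dbi_thresh += max_u32(blocks_needed, dbi_thresh / 2);
	if (dbi_thresh > DATA_BLOCK_BITS)
		dbi_thresh = DATA_BLOCK_BITS;
	return dbi_thresh;
}

int main(void)
{
	uint32_t thresh = 0;

	/* e.g. a stream of commands that each need 64 data blocks */
	for (int i = 0; i < 6; i++) {
		thresh = grow_dbi_thresh(thresh, 64);
		printf("after request %d: dbi_thresh = %u blocks\n",
		       i + 1, (unsigned)thresh);
	}
	return 0;
}

In the kernel code the threshold is only expanded when the free space below the current threshold cannot satisfy the request; the sketch just shows the size of each expansion step.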
Diffstat (limited to 'drivers/target')
-rw-r--r--	drivers/target/target_core_user.c	464
1 file changed, 336 insertions(+), 128 deletions(-)
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 02cf543a888d..0b29e4f00bce 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -31,6 +31,8 @@
 #include <linux/bitops.h>
 #include <linux/highmem.h>
 #include <linux/configfs.h>
+#include <linux/mutex.h>
+#include <linux/kthread.h>
 #include <net/genetlink.h>
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
@@ -67,17 +69,24 @@
 
 #define TCMU_TIME_OUT (30 * MSEC_PER_SEC)
 
-/* For cmd area, the size is fixed 2M */
-#define CMDR_SIZE (2 * 1024 * 1024)
+/* For cmd area, the size is fixed 8MB */
+#define CMDR_SIZE (8 * 1024 * 1024)
 
-/* For data area, the size is fixed 32M */
-#define DATA_BLOCK_BITS (8 * 1024)
-#define DATA_BLOCK_SIZE 4096
+/*
+ * For data area, the block size is PAGE_SIZE and
+ * the total size is 256K * PAGE_SIZE.
+ */
+#define DATA_BLOCK_SIZE PAGE_SIZE
+#define DATA_BLOCK_BITS (256 * 1024)
 #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
+#define DATA_BLOCK_INIT_BITS 128
 
-/* The ring buffer size is 34M */
+/* The total size of the ring is 8M + 256K * PAGE_SIZE */
 #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
 
+/* Default maximum of the global data blocks(512K * PAGE_SIZE) */
+#define TCMU_GLOBAL_MAX_BLOCKS (512 * 1024)
+
 static struct device *tcmu_root_device;
 
 struct tcmu_hba {
@@ -87,6 +96,8 @@ struct tcmu_hba {
 #define TCMU_CONFIG_LEN 256
 
 struct tcmu_dev {
+	struct list_head node;
+
 	struct se_device se_dev;
 
 	char *name;
@@ -98,6 +109,8 @@ struct tcmu_dev {
 
 	struct uio_info uio_info;
 
+	struct inode *inode;
+
 	struct tcmu_mailbox *mb_addr;
 	size_t dev_size;
 	u32 cmdr_size;
@@ -108,10 +121,11 @@ struct tcmu_dev {
 	size_t data_size;
 
 	wait_queue_head_t wait_cmdr;
-	/* TODO should this be a mutex? */
-	spinlock_t cmdr_lock;
+	struct mutex cmdr_lock;
 
+	bool waiting_global;
 	uint32_t dbi_max;
+	uint32_t dbi_thresh;
 	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
 	struct radix_tree_root data_blocks;
 
@@ -146,6 +160,13 @@ struct tcmu_cmd {
 	unsigned long flags;
 };
 
+static struct task_struct *unmap_thread;
+static wait_queue_head_t unmap_wait;
+static DEFINE_MUTEX(root_udev_mutex);
+static LIST_HEAD(root_udev);
+
+static atomic_t global_db_count = ATOMIC_INIT(0);
+
 static struct kmem_cache *tcmu_cmd_cache;
 
 /* multicast group */
@@ -174,48 +195,78 @@ static struct genl_family tcmu_genl_family __ro_after_init = {
 #define tcmu_cmd_set_dbi(cmd, index) ((cmd)->dbi[(cmd)->dbi_cur++] = (index))
 #define tcmu_cmd_get_dbi(cmd) ((cmd)->dbi[(cmd)->dbi_cur++])
 
-static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd)
+static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len)
 {
 	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
 	uint32_t i;
 
-	for (i = 0; i < tcmu_cmd->dbi_cnt; i++)
+	for (i = 0; i < len; i++)
 		clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap);
 }
 
-static int tcmu_get_empty_block(struct tcmu_dev *udev, void **addr)
+static inline bool tcmu_get_empty_block(struct tcmu_dev *udev,
+					struct tcmu_cmd *tcmu_cmd)
 {
-	void *p;
-	uint32_t dbi;
-	int ret;
+	struct page *page;
+	int ret, dbi;
 
-	dbi = find_first_zero_bit(udev->data_bitmap, DATA_BLOCK_BITS);
-	if (dbi > udev->dbi_max)
-		udev->dbi_max = dbi;
+	dbi = find_first_zero_bit(udev->data_bitmap, udev->dbi_thresh);
+	if (dbi == udev->dbi_thresh)
+		return false;
 
-	set_bit(dbi, udev->data_bitmap);
+	page = radix_tree_lookup(&udev->data_blocks, dbi);
+	if (!page) {
 
-	p = radix_tree_lookup(&udev->data_blocks, dbi);
-	if (!p) {
-		p = kzalloc(DATA_BLOCK_SIZE, GFP_ATOMIC);
-		if (!p) {
-			clear_bit(dbi, udev->data_bitmap);
-			return -ENOMEM;
+		if (atomic_add_return(1, &global_db_count) >
+					TCMU_GLOBAL_MAX_BLOCKS) {
+			atomic_dec(&global_db_count);
+			return false;
 		}
 
-		ret = radix_tree_insert(&udev->data_blocks, dbi, p);
+		/* try to get new page from the mm */
+		page = alloc_page(GFP_KERNEL);
+		if (!page)
+			return false;
+
+		ret = radix_tree_insert(&udev->data_blocks, dbi, page);
 		if (ret) {
-			kfree(p);
-			clear_bit(dbi, udev->data_bitmap);
-			return ret;
+			__free_page(page);
+			return false;
 		}
+
 	}
 
-	*addr = p;
-	return dbi;
+	if (dbi > udev->dbi_max)
+		udev->dbi_max = dbi;
+
+	set_bit(dbi, udev->data_bitmap);
+	tcmu_cmd_set_dbi(tcmu_cmd, dbi);
+
+	return true;
 }
 
-static void *tcmu_get_block_addr(struct tcmu_dev *udev, uint32_t dbi)
+static bool tcmu_get_empty_blocks(struct tcmu_dev *udev,
+				  struct tcmu_cmd *tcmu_cmd)
+{
+	int i;
+
+	udev->waiting_global = false;
+
+	for (i = tcmu_cmd->dbi_cur; i < tcmu_cmd->dbi_cnt; i++) {
+		if (!tcmu_get_empty_block(udev, tcmu_cmd))
+			goto err;
+	}
+	return true;
+
+err:
+	udev->waiting_global = true;
+	/* Try to wake up the unmap thread */
+	wake_up(&unmap_wait);
+	return false;
+}
+
+static inline struct page *
+tcmu_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
 {
 	return radix_tree_lookup(&udev->data_blocks, dbi);
 }
@@ -355,7 +406,7 @@ static inline size_t iov_tail(struct tcmu_dev *udev, struct iovec *iov)
 	return (size_t)iov->iov_base + iov->iov_len;
 }
 
-static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
+static int scatter_data_area(struct tcmu_dev *udev,
 	struct tcmu_cmd *tcmu_cmd, struct scatterlist *data_sg,
 	unsigned int data_nents, struct iovec **iov,
 	int *iov_cnt, bool copy_data)
@@ -365,19 +416,20 @@ static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
 	void *from, *to = NULL;
 	size_t copy_bytes, to_offset, offset;
 	struct scatterlist *sg;
+	struct page *page;
 
 	for_each_sg(data_sg, sg, data_nents, i) {
 		int sg_remaining = sg->length;
 		from = kmap_atomic(sg_page(sg)) + sg->offset;
 		while (sg_remaining > 0) {
 			if (block_remaining == 0) {
+				if (to)
+					kunmap_atomic(to);
+
 				block_remaining = DATA_BLOCK_SIZE;
-				dbi = tcmu_get_empty_block(udev, &to);
-				if (dbi < 0) {
-					kunmap_atomic(from - sg->offset);
-					return dbi;
-				}
-				tcmu_cmd_set_dbi(tcmu_cmd, dbi);
+				dbi = tcmu_cmd_get_dbi(tcmu_cmd);
+				page = tcmu_get_block_page(udev, dbi);
+				to = kmap_atomic(page);
 			}
 
 			copy_bytes = min_t(size_t, sg_remaining,
@@ -405,6 +457,8 @@ static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
 		}
 		kunmap_atomic(from - sg->offset);
 	}
+	if (to)
+		kunmap_atomic(to);
 
 	return 0;
 }
@@ -415,9 +469,10 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 	struct se_cmd *se_cmd = cmd->se_cmd;
 	int i, dbi;
 	int block_remaining = 0;
-	void *from, *to;
+	void *from = NULL, *to;
 	size_t copy_bytes, offset;
 	struct scatterlist *sg, *data_sg;
+	struct page *page;
 	unsigned int data_nents;
 	uint32_t count = 0;
 
@@ -444,9 +499,13 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 		to = kmap_atomic(sg_page(sg)) + sg->offset;
 		while (sg_remaining > 0) {
 			if (block_remaining == 0) {
+				if (from)
+					kunmap_atomic(from);
+
 				block_remaining = DATA_BLOCK_SIZE;
 				dbi = tcmu_cmd_get_dbi(cmd);
-				from = tcmu_get_block_addr(udev, dbi);
+				page = tcmu_get_block_page(udev, dbi);
+				from = kmap_atomic(page);
 			}
 			copy_bytes = min_t(size_t, sg_remaining,
 					block_remaining);
@@ -461,12 +520,13 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
 		}
 		kunmap_atomic(to - sg->offset);
 	}
+	if (from)
+		kunmap_atomic(from);
 }
 
-static inline size_t spc_bitmap_free(unsigned long *bitmap)
+static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh)
 {
-	return DATA_BLOCK_SIZE * (DATA_BLOCK_BITS -
-			bitmap_weight(bitmap, DATA_BLOCK_BITS));
+	return DATA_BLOCK_SIZE * (thresh - bitmap_weight(bitmap, thresh));
 }
 
 /*
@@ -475,9 +535,12 @@ static inline size_t spc_bitmap_free(unsigned long *bitmap)
  *
  * Called with ring lock held.
  */
-static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t data_needed)
+static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
+				size_t cmd_size, size_t data_needed)
 {
 	struct tcmu_mailbox *mb = udev->mb_addr;
+	uint32_t blocks_needed = (data_needed + DATA_BLOCK_SIZE - 1)
+				/ DATA_BLOCK_SIZE;
 	size_t space, cmd_needed;
 	u32 cmd_head;
 
@@ -501,13 +564,41 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
 		return false;
 	}
 
-	space = spc_bitmap_free(udev->data_bitmap);
+	/* try to check and get the data blocks as needed */
+	space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
 	if (space < data_needed) {
-		pr_debug("no data space: only %zu available, but ask for %zu\n",
-			space, data_needed);
-		return false;
+		unsigned long blocks_left = DATA_BLOCK_BITS - udev->dbi_thresh;
+		unsigned long grow;
+
+		if (blocks_left < blocks_needed) {
+			pr_debug("no data space: only %lu available, but ask for %zu\n",
+					blocks_left * DATA_BLOCK_SIZE,
+					data_needed);
+			return false;
+		}
+
+		/* Try to expand the thresh */
+		if (!udev->dbi_thresh) {
+			/* From idle state */
+			uint32_t init_thresh = DATA_BLOCK_INIT_BITS;
+
+			udev->dbi_thresh = max(blocks_needed, init_thresh);
+		} else {
+			/*
+			 * Grow the data area by max(blocks needed,
+			 * dbi_thresh / 2), but limited to the max
+			 * DATA_BLOCK_BITS size.
+			 */
+			grow = max(blocks_needed, udev->dbi_thresh / 2);
+			udev->dbi_thresh += grow;
+			if (udev->dbi_thresh > DATA_BLOCK_BITS)
+				udev->dbi_thresh = DATA_BLOCK_BITS;
+		}
 	}
 
+	if (!tcmu_get_empty_blocks(udev, cmd))
+		return false;
+
 	return true;
 }
 
@@ -544,7 +635,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 
 	WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1));
 
-	spin_lock_irq(&udev->cmdr_lock);
+	mutex_lock(&udev->cmdr_lock);
 
 	mb = udev->mb_addr;
 	cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
@@ -553,18 +644,18 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 		pr_warn("TCMU: Request of size %zu/%zu is too big for %u/%zu "
 			"cmd ring/data area\n", command_size, data_length,
 			udev->cmdr_size, udev->data_size);
-		spin_unlock_irq(&udev->cmdr_lock);
+		mutex_unlock(&udev->cmdr_lock);
 		return TCM_INVALID_CDB_FIELD;
 	}
 
-	while (!is_ring_space_avail(udev, command_size, data_length)) {
+	while (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) {
 		int ret;
 		DEFINE_WAIT(__wait);
 
 		prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE);
 
 		pr_debug("sleeping for ring space\n");
-		spin_unlock_irq(&udev->cmdr_lock);
+		mutex_unlock(&udev->cmdr_lock);
 		if (udev->cmd_time_out)
 			ret = schedule_timeout(
 				msecs_to_jiffies(udev->cmd_time_out));
@@ -576,7 +667,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 			return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 		}
 
-		spin_lock_irq(&udev->cmdr_lock);
+		mutex_lock(&udev->cmdr_lock);
 
 		/* We dropped cmdr_lock, cmd_head is stale */
 		cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
@@ -609,15 +700,18 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 	entry->hdr.uflags = 0;
 
 	/* Handle allocating space from the data area */
+	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
 	iov = &entry->req.iov[0];
 	iov_cnt = 0;
 	copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE
 		|| se_cmd->se_cmd_flags & SCF_BIDI);
-	ret = alloc_and_scatter_data_area(udev, tcmu_cmd,
-		se_cmd->t_data_sg, se_cmd->t_data_nents,
-		&iov, &iov_cnt, copy_to_data_area);
+	ret = scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg,
+				se_cmd->t_data_nents, &iov, &iov_cnt,
+				copy_to_data_area);
 	if (ret) {
-		spin_unlock_irq(&udev->cmdr_lock);
+		tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
+		mutex_unlock(&udev->cmdr_lock);
+
 		pr_err("tcmu: alloc and scatter data failed\n");
 		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 	}
@@ -628,12 +722,14 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 	if (se_cmd->se_cmd_flags & SCF_BIDI) {
 		iov_cnt = 0;
 		iov++;
-		ret = alloc_and_scatter_data_area(udev, tcmu_cmd,
+		ret = scatter_data_area(udev, tcmu_cmd,
 					se_cmd->t_bidi_data_sg,
 					se_cmd->t_bidi_data_nents,
 					&iov, &iov_cnt, false);
 		if (ret) {
-			spin_unlock_irq(&udev->cmdr_lock);
+			tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cnt);
+			mutex_unlock(&udev->cmdr_lock);
+
 			pr_err("tcmu: alloc and scatter bidi data failed\n");
 			return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 		}
@@ -648,8 +744,7 @@ tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd)
 
 	UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size);
 	tcmu_flush_dcache_range(mb, sizeof(*mb));
-
-	spin_unlock_irq(&udev->cmdr_lock);
+	mutex_unlock(&udev->cmdr_lock);
 
 	/* TODO: only if FLUSH and FUA? */
 	uio_event_notify(&udev->uio_info);
@@ -723,14 +818,13 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
 
 out:
 	cmd->se_cmd = NULL;
-	tcmu_cmd_free_data(cmd);
+	tcmu_cmd_free_data(cmd, cmd->dbi_cnt);
 	tcmu_free_cmd(cmd);
 }
 
 static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 {
 	struct tcmu_mailbox *mb;
-	unsigned long flags;
 	int handled = 0;
 
 	if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) {
@@ -738,8 +832,6 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 		return 0;
 	}
 
-	spin_lock_irqsave(&udev->cmdr_lock, flags);
-
 	mb = udev->mb_addr;
 	tcmu_flush_dcache_range(mb, sizeof(*mb));
 
@@ -780,8 +872,6 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 	if (mb->cmd_tail == mb->cmd_head)
 		del_timer(&udev->timeout); /* no more pending cmds */
 
-	spin_unlock_irqrestore(&udev->cmdr_lock, flags);
-
 	wake_up(&udev->wait_cmdr);
 
 	return handled;
@@ -808,16 +898,14 @@ static void tcmu_device_timedout(unsigned long data)
 {
 	struct tcmu_dev *udev = (struct tcmu_dev *)data;
 	unsigned long flags;
-	int handled;
-
-	handled = tcmu_handle_completions(udev);
-
-	pr_warn("%d completions handled from timeout\n", handled);
 
 	spin_lock_irqsave(&udev->commands_lock, flags);
 	idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL);
 	spin_unlock_irqrestore(&udev->commands_lock, flags);
 
+	/* Try to wake up the ummap thread */
+	wake_up(&unmap_wait);
+
 	/*
 	 * We don't need to wakeup threads on wait_cmdr since they have their
 	 * own timeout.
@@ -862,7 +950,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	udev->cmd_time_out = TCMU_TIME_OUT;
 
 	init_waitqueue_head(&udev->wait_cmdr);
-	spin_lock_init(&udev->cmdr_lock);
+	mutex_init(&udev->cmdr_lock);
 
 	idr_init(&udev->commands);
 	spin_lock_init(&udev->commands_lock);
@@ -877,59 +965,13 @@ static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on)
 {
 	struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info);
 
+	mutex_lock(&tcmu_dev->cmdr_lock);
 	tcmu_handle_completions(tcmu_dev);
+	mutex_unlock(&tcmu_dev->cmdr_lock);
 
 	return 0;
 }
 
-static void tcmu_blocks_release(struct tcmu_dev *udev, bool release_pending)
-{
-	uint32_t dbi, end;
-	void *addr;
-
-	spin_lock_irq(&udev->cmdr_lock);
-
-	end = udev->dbi_max + 1;
-
-	/* try to release all unused blocks */
-	dbi = find_first_zero_bit(udev->data_bitmap, end);
-	if (dbi >= end) {
-		spin_unlock_irq(&udev->cmdr_lock);
-		return;
-	}
-	do {
-		addr = radix_tree_delete(&udev->data_blocks, dbi);
-		kfree(addr);
-
-		dbi = find_next_zero_bit(udev->data_bitmap, end, dbi + 1);
-	} while (dbi < end);
-
-	if (!release_pending)
-		return;
-
-	/* try to release all pending blocks */
-	dbi = find_first_bit(udev->data_bitmap, end);
-	if (dbi >= end) {
-		spin_unlock_irq(&udev->cmdr_lock);
-		return;
-	}
-	do {
-		addr = radix_tree_delete(&udev->data_blocks, dbi);
-		kfree(addr);
-
-		dbi = find_next_bit(udev->data_bitmap, end, dbi + 1);
-	} while (dbi < end);
-
-	spin_unlock_irq(&udev->cmdr_lock);
-}
-
-static void tcmu_vma_close(struct vm_area_struct *vma)
-{
-	struct tcmu_dev *udev = vma->vm_private_data;
-
-	tcmu_blocks_release(udev, false);
-}
-
 /*
  * mmap code from uio.c. Copied here because we want to hook mmap()
  * and this stuff must come along.
@@ -947,6 +989,60 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma)
 	return -1;
 }
 
+static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
+{
+	struct page *page;
+	int ret;
+
+	mutex_lock(&udev->cmdr_lock);
+	page = tcmu_get_block_page(udev, dbi);
+	if (likely(page)) {
+		mutex_unlock(&udev->cmdr_lock);
+		return page;
+	}
+
+	/*
+	 * Normally it shouldn't be here:
+	 * Only when the userspace has touched the blocks which
+	 * are out of the tcmu_cmd's data iov[], and will return
+	 * one zeroed page.
+	 */
+	pr_warn("Block(%u) out of cmd's iov[] has been touched!\n", dbi);
+	pr_warn("Mostly it will be a bug of userspace, please have a check!\n");
+
+	if (dbi >= udev->dbi_thresh) {
+		/* Extern the udev->dbi_thresh to dbi + 1 */
+		udev->dbi_thresh = dbi + 1;
+		udev->dbi_max = dbi;
+	}
+
+	page = radix_tree_lookup(&udev->data_blocks, dbi);
+	if (!page) {
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page) {
+			mutex_unlock(&udev->cmdr_lock);
+			return NULL;
+		}
+
+		ret = radix_tree_insert(&udev->data_blocks, dbi, page);
+		if (ret) {
+			mutex_unlock(&udev->cmdr_lock);
+			__free_page(page);
+			return NULL;
+		}
+
+		/*
+		 * Since this case is rare in page fault routine, here we
+		 * will allow the global_db_count >= TCMU_GLOBAL_MAX_BLOCKS
+		 * to reduce possible page fault call trace.
+		 */
+		atomic_inc(&global_db_count);
+	}
+	mutex_unlock(&udev->cmdr_lock);
+
+	return page;
+}
+
 static int tcmu_vma_fault(struct vm_fault *vmf)
 {
 	struct tcmu_dev *udev = vmf->vma->vm_private_data;
@@ -970,14 +1066,13 @@ static int tcmu_vma_fault(struct vm_fault *vmf)
 		addr = (void *)(unsigned long)info->mem[mi].addr + offset;
 		page = vmalloc_to_page(addr);
 	} else {
-		/* For the dynamically growing data area pages */
 		uint32_t dbi;
 
+		/* For the dynamically growing data area pages */
 		dbi = (offset - udev->data_off) / DATA_BLOCK_SIZE;
-		addr = tcmu_get_block_addr(udev, dbi);
-		if (!addr)
+		page = tcmu_try_get_block_page(udev, dbi);
+		if (!page)
 			return VM_FAULT_NOPAGE;
-		page = virt_to_page(addr);
 	}
 
 	get_page(page);
@@ -986,7 +1081,6 @@ static int tcmu_vma_fault(struct vm_fault *vmf)
 }
 
 static const struct vm_operations_struct tcmu_vm_ops = {
-	.close = tcmu_vma_close,
 	.fault = tcmu_vma_fault,
 };
 
@@ -1014,6 +1108,8 @@ static int tcmu_open(struct uio_info *info, struct inode *inode)
 	if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags))
 		return -EBUSY;
 
+	udev->inode = inode;
+
 	pr_debug("open\n");
 
 	return 0;
@@ -1104,6 +1200,8 @@ static int tcmu_configure_device(struct se_device *dev)
 	udev->cmdr_size = CMDR_SIZE - CMDR_OFF;
 	udev->data_off = CMDR_SIZE;
 	udev->data_size = DATA_SIZE;
+	udev->dbi_thresh = 0; /* Default in Idle state */
+	udev->waiting_global = false;
 
 	/* Initialise the mailbox of the ring buffer */
 	mb = udev->mb_addr;
@@ -1116,7 +1214,7 @@ static int tcmu_configure_device(struct se_device *dev)
 	WARN_ON(udev->data_size % PAGE_SIZE);
 	WARN_ON(udev->data_size % DATA_BLOCK_SIZE);
 
-	INIT_RADIX_TREE(&udev->data_blocks, GFP_ATOMIC);
+	INIT_RADIX_TREE(&udev->data_blocks, GFP_KERNEL);
 
 	info->version = __stringify(TCMU_MAILBOX_VERSION);
 
@@ -1149,6 +1247,10 @@ static int tcmu_configure_device(struct se_device *dev)
 	if (ret)
 		goto err_netlink;
 
+	mutex_lock(&root_udev_mutex);
+	list_add(&udev->node, &root_udev);
+	mutex_unlock(&root_udev_mutex);
+
 	return 0;
 
 err_netlink:
@@ -1183,6 +1285,23 @@ static bool tcmu_dev_configured(struct tcmu_dev *udev)
 	return udev->uio_info.uio_dev ? true : false;
 }
 
+static void tcmu_blocks_release(struct tcmu_dev *udev)
+{
+	int i;
+	struct page *page;
+
+	/* Try to release all block pages */
+	mutex_lock(&udev->cmdr_lock);
+	for (i = 0; i <= udev->dbi_max; i++) {
+		page = radix_tree_delete(&udev->data_blocks, i);
+		if (page) {
+			__free_page(page);
+			atomic_dec(&global_db_count);
+		}
+	}
+	mutex_unlock(&udev->cmdr_lock);
+}
+
 static void tcmu_free_device(struct se_device *dev)
 {
 	struct tcmu_dev *udev = TCMU_DEV(dev);
@@ -1192,6 +1311,10 @@ static void tcmu_free_device(struct se_device *dev)
 
 	del_timer_sync(&udev->timeout);
 
+	mutex_lock(&root_udev_mutex);
+	list_del(&udev->node);
+	mutex_unlock(&root_udev_mutex);
+
 	vfree(udev->mb_addr);
 
 	/* Upper layer should drain all requests before calling this */
@@ -1204,7 +1327,7 @@ static void tcmu_free_device(struct se_device *dev)
 	spin_unlock_irq(&udev->commands_lock);
 	WARN_ON(!all_expired);
 
-	tcmu_blocks_release(udev, true);
+	tcmu_blocks_release(udev);
 
 	if (tcmu_dev_configured(udev)) {
 		tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name,
@@ -1392,6 +1515,81 @@ static struct target_backend_ops tcmu_ops = {
 	.tb_dev_attrib_attrs = NULL,
 };
 
+static int unmap_thread_fn(void *data)
+{
+	struct tcmu_dev *udev;
+	loff_t off;
+	uint32_t start, end, block;
+	struct page *page;
+	int i;
+
+	while (1) {
+		DEFINE_WAIT(__wait);
+
+		prepare_to_wait(&unmap_wait, &__wait, TASK_INTERRUPTIBLE);
+		schedule();
+		finish_wait(&unmap_wait, &__wait);
+
+		mutex_lock(&root_udev_mutex);
+		list_for_each_entry(udev, &root_udev, node) {
+			mutex_lock(&udev->cmdr_lock);
+
+			/* Try to complete the finished commands first */
+			tcmu_handle_completions(udev);
+
+			/* Skip the udevs waiting the global pool or in idle */
+			if (udev->waiting_global || !udev->dbi_thresh) {
+				mutex_unlock(&udev->cmdr_lock);
+				continue;
+			}
+
+			end = udev->dbi_max + 1;
+			block = find_last_bit(udev->data_bitmap, end);
+			if (block == udev->dbi_max) {
+				/*
+				 * The last bit is dbi_max, so there is
+				 * no need to shrink any blocks.
+				 */
+				mutex_unlock(&udev->cmdr_lock);
+				continue;
+			} else if (block == end) {
+				/* The current udev will goto idle state */
+				udev->dbi_thresh = start = 0;
+				udev->dbi_max = 0;
+			} else {
+				udev->dbi_thresh = start = block + 1;
+				udev->dbi_max = block;
+			}
+
+			/* Here will truncate the data area from off */
+			off = udev->data_off + start * DATA_BLOCK_SIZE;
+			unmap_mapping_range(udev->inode->i_mapping, off, 0, 1);
+
+			/* Release the block pages */
+			for (i = start; i < end; i++) {
+				page = radix_tree_delete(&udev->data_blocks, i);
+				if (page) {
+					__free_page(page);
+					atomic_dec(&global_db_count);
+				}
+			}
+			mutex_unlock(&udev->cmdr_lock);
+		}
+
+		/*
+		 * Try to wake up the udevs who are waiting
+		 * for the global data pool.
+		 */
+		list_for_each_entry(udev, &root_udev, node) {
+			if (udev->waiting_global)
+				wake_up(&udev->wait_cmdr);
+		}
+		mutex_unlock(&root_udev_mutex);
+	}
+
+	return 0;
+}
+
 static int __init tcmu_module_init(void)
 {
 	int ret, i, len = 0;
@@ -1437,8 +1635,17 @@ static int __init tcmu_module_init(void)
 	if (ret)
 		goto out_attrs;
 
+	init_waitqueue_head(&unmap_wait);
+	unmap_thread = kthread_run(unmap_thread_fn, NULL, "tcmu_unmap");
+	if (IS_ERR(unmap_thread)) {
+		ret = PTR_ERR(unmap_thread);
+		goto out_unreg_transport;
+	}
+
 	return 0;
 
+out_unreg_transport:
+	target_backend_unregister(&tcmu_ops);
 out_attrs:
 	kfree(tcmu_attrs);
 out_unreg_genl:
@@ -1453,6 +1660,7 @@ out_free_cache:
 
 static void __exit tcmu_module_exit(void)
 {
+	kthread_stop(unmap_thread);
 	target_backend_unregister(&tcmu_ops);
 	kfree(tcmu_attrs);
 	genl_unregister_family(&tcmu_genl_family);