diff options
author | Andy Grover <agrover@redhat.com> | 2014-10-01 19:07:05 -0400 |
---|---|---|
committer | Nicholas Bellinger <nab@linux-iscsi.org> | 2014-10-03 14:15:20 -0400 |
commit | 7c9e7a6fe11c8dc5b3b9d0e889dde73347247584 (patch) | |
tree | 9c4c3a753228617e308226f47dc6d4fe83ddf15d /include/uapi | |
parent | ce87685128f3e0fced2aca9f73fc8cc67704ae11 (diff) |
target: Add a user-passthrough backstore
Add a LIO storage engine that presents commands to userspace for execution.
This would allow more complex backstores to be implemented out-of-kernel,
and also make experimentation a-la FUSE (but at the SCSI level -- "SUSE"?)
possible.
It uses a mmap()able UIO device per LUN to share a command ring and data
area. The commands are raw SCSI CDBs and iovs for in/out data. The command
ring is also reused for returning scsi command status and optional sense
data.
This implementation is based on Shaohua Li's earlier version but heavily
modified. Differences include:
* Shared memory allocated by kernel, not locked-down user pages
* Single ring for command request and response
* Offsets instead of embedded pointers
* Generic SCSI CDB passthrough instead of per-cmd specialization in ring
format.
* Uses UIO device instead of anon_file passed in mailbox.
* Optional in-kernel handling of some commands.
The main reason for these differences is to permit greater resiliency
if the user process dies or hangs.
Things not yet implemented (on purpose):
* Zero copy. The data area is flexible enough to allow page flipping or
backend-allocated pages to be used by fabrics, but it's not clear these
are performance wins. Can come later.
* Out-of-order command completion by userspace. Possible to add by just
allowing userspace to change cmd_id in rsp cmd entries, but currently
not supported.
* No locks between kernel cmd submission and completion routines. Sounds
like it's possible, but this can come later.
* Sparse allocation of mmaped area. Current code vmallocs the whole thing.
If the mapped area were larger and not fully mapped, then the driver would
have more freedom to change cmd and data area sizes based on demand.
Current code open issues:
* The use of idrs may be overkill -- we may be able to replace them with a
simple counter to generate cmd_ids, and a hash table to get a cmd_id's
associated pointer.
* Use of a free-running counter for cmd ring instead of explicit modulo
math. This would require power-of-2 cmd ring size.
(Add kconfig depends NET - Randy)
Signed-off-by: Andy Grover <agrover@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
Diffstat (limited to 'include/uapi')
-rw-r--r-- | include/uapi/linux/Kbuild | 1 | ||||
-rw-r--r-- | include/uapi/linux/target_core_user.h | 142 |
2 files changed, 143 insertions, 0 deletions
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index be88166349a1..6ebd0d1faf2e 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild | |||
@@ -371,6 +371,7 @@ header-y += swab.h | |||
371 | header-y += synclink.h | 371 | header-y += synclink.h |
372 | header-y += sysctl.h | 372 | header-y += sysctl.h |
373 | header-y += sysinfo.h | 373 | header-y += sysinfo.h |
374 | header-y += target_core_user.h | ||
374 | header-y += taskstats.h | 375 | header-y += taskstats.h |
375 | header-y += tcp.h | 376 | header-y += tcp.h |
376 | header-y += tcp_metrics.h | 377 | header-y += tcp_metrics.h |
diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h new file mode 100644 index 000000000000..7dcfbe6771b1 --- /dev/null +++ b/include/uapi/linux/target_core_user.h | |||
@@ -0,0 +1,142 @@ | |||
1 | #ifndef __TARGET_CORE_USER_H | ||
2 | #define __TARGET_CORE_USER_H | ||
3 | |||
4 | /* This header will be used by application too */ | ||
5 | |||
6 | #include <linux/types.h> | ||
7 | #include <linux/uio.h> | ||
8 | |||
9 | #ifndef __packed | ||
10 | #define __packed __attribute__((packed)) | ||
11 | #endif | ||
12 | |||
13 | #define TCMU_VERSION "1.0" | ||
14 | |||
15 | /* | ||
16 | * Ring Design | ||
17 | * ----------- | ||
18 | * | ||
19 | * The mmaped area is divided into three parts: | ||
20 | * 1) The mailbox (struct tcmu_mailbox, below) | ||
21 | * 2) The command ring | ||
22 | * 3) Everything beyond the command ring (data) | ||
23 | * | ||
24 | * The mailbox tells userspace the offset of the command ring from the | ||
25 | * start of the shared memory region, and how big the command ring is. | ||
26 | * | ||
27 | * The kernel passes SCSI commands to userspace by putting a struct | ||
28 | * tcmu_cmd_entry in the ring, updating mailbox->cmd_head, and poking | ||
29 | * userspace via uio's interrupt mechanism. | ||
30 | * | ||
31 | * tcmu_cmd_entry contains a header. If the header's opcode is TCMU_OP_PAD, | ||
32 | * userspace should skip tcmu_hdr_get_len() bytes (mod cmdr_size) to find | ||
33 | * the next cmd_entry. | ||
34 | * | ||
35 | * Otherwise, the entry will contain offsets into the mmaped area that | ||
36 | * contain the cdb and data buffers -- the latter accessible via the | ||
37 | * iov array. iov addresses are also offsets into the shared area. | ||
38 | * | ||
39 | * When userspace has finished handling the command, it should set | ||
40 | * entry->rsp.scsi_status, fill in rsp.sense_buffer if appropriate, | ||
41 | * and also set mailbox->cmd_tail equal to the old cmd_tail plus the | ||
42 | * entry's length (tcmu_hdr_get_len()), mod cmdr_size. If cmd_tail doesn't | ||
43 | * equal cmd_head, it should process the next packet the same way, and so on. | ||
44 | */ | ||
45 | |||
46 | #define TCMU_MAILBOX_VERSION 1 | ||
47 | #define ALIGN_SIZE 64 /* Should be enough for most CPUs */ | ||
48 | |||
49 | struct tcmu_mailbox { | ||
50 | __u16 version; | ||
51 | __u16 flags; | ||
52 | __u32 cmdr_off; | ||
53 | __u32 cmdr_size; | ||
54 | |||
55 | __u32 cmd_head; | ||
56 | |||
57 | /* Updated by user. On its own cacheline */ | ||
58 | __u32 cmd_tail __attribute__((__aligned__(ALIGN_SIZE))); | ||
59 | |||
60 | } __packed; | ||
61 | |||
/* Opcode stored in the low bits of a command ring entry header. */
enum tcmu_opcode {
	TCMU_OP_PAD = 0,	/* padding entry: skip over it */
	TCMU_OP_CMD = 1,	/* entry carries a command */
};
66 | |||
/*
 * Header found at the start of every command ring entry.
 *
 * Entry lengths are 8-byte aligned and there are only a few opcodes, so a
 * single word carries both: the opcode sits in the low TCMU_OP_MASK bits of
 * len_op and the length occupies the remaining bits.
 */
struct tcmu_cmd_entry_hdr {
	__u32 len_op;
} __packed;
73 | |||
74 | #define TCMU_OP_MASK 0x7 | ||
75 | |||
76 | static inline enum tcmu_opcode tcmu_hdr_get_op(struct tcmu_cmd_entry_hdr *hdr) | ||
77 | { | ||
78 | return hdr->len_op & TCMU_OP_MASK; | ||
79 | } | ||
80 | |||
81 | static inline void tcmu_hdr_set_op(struct tcmu_cmd_entry_hdr *hdr, enum tcmu_opcode op) | ||
82 | { | ||
83 | hdr->len_op &= ~TCMU_OP_MASK; | ||
84 | hdr->len_op |= (op & TCMU_OP_MASK); | ||
85 | } | ||
86 | |||
87 | static inline __u32 tcmu_hdr_get_len(struct tcmu_cmd_entry_hdr *hdr) | ||
88 | { | ||
89 | return hdr->len_op & ~TCMU_OP_MASK; | ||
90 | } | ||
91 | |||
92 | static inline void tcmu_hdr_set_len(struct tcmu_cmd_entry_hdr *hdr, __u32 len) | ||
93 | { | ||
94 | hdr->len_op &= TCMU_OP_MASK; | ||
95 | hdr->len_op |= len; | ||
96 | } | ||
97 | |||
98 | /* Currently the same as SCSI_SENSE_BUFFERSIZE */ | ||
99 | #define TCMU_SENSE_BUFFERSIZE 96 | ||
100 | |||
101 | struct tcmu_cmd_entry { | ||
102 | struct tcmu_cmd_entry_hdr hdr; | ||
103 | |||
104 | uint16_t cmd_id; | ||
105 | uint16_t __pad1; | ||
106 | |||
107 | union { | ||
108 | struct { | ||
109 | uint64_t cdb_off; | ||
110 | uint64_t iov_cnt; | ||
111 | struct iovec iov[0]; | ||
112 | } req; | ||
113 | struct { | ||
114 | uint8_t scsi_status; | ||
115 | uint8_t __pad1; | ||
116 | uint16_t __pad2; | ||
117 | uint32_t __pad3; | ||
118 | char sense_buffer[TCMU_SENSE_BUFFERSIZE]; | ||
119 | } rsp; | ||
120 | }; | ||
121 | |||
122 | } __packed; | ||
123 | |||
124 | #define TCMU_OP_ALIGN_SIZE sizeof(uint64_t) | ||
125 | |||
/*
 * Device add/remove command identifiers.
 * NOTE(review): the "genl" in the name suggests these are generic netlink
 * commands -- confirm against the kernel side.
 *
 * __TCMU_CMD_MAX must stay last; TCMU_CMD_MAX is the highest real command.
 */
enum tcmu_genl_cmd {
	TCMU_CMD_UNSPEC = 0,
	TCMU_CMD_ADDED_DEVICE = 1,
	TCMU_CMD_REMOVED_DEVICE = 2,
	__TCMU_CMD_MAX,
};
#define TCMU_CMD_MAX (__TCMU_CMD_MAX - 1)
133 | |||
/*
 * Attribute identifiers that accompany the tcmu_genl_cmd commands.
 *
 * __TCMU_ATTR_MAX must stay last; TCMU_ATTR_MAX is the highest real
 * attribute.
 */
enum tcmu_genl_attr {
	TCMU_ATTR_UNSPEC = 0,
	TCMU_ATTR_DEVICE = 1,
	TCMU_ATTR_MINOR = 2,
	__TCMU_ATTR_MAX,
};
#define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1)
141 | |||
142 | #endif | ||