diff options
author | Or Gerlitz <ogerlitz@voltaire.com> | 2006-05-11 03:03:08 -0400 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2006-06-22 10:51:12 -0400 |
commit | 6461f64ab51e6929680df064b2682004a1548290 (patch) | |
tree | 52922f1830a6f6ed7fe546db9230a507f91974a7 /drivers/infiniband | |
parent | 1cfa0a75dbef1d5bf687aacafabb023288f6b36a (diff) |
IB/iser: iSER handling of memory for RDMA
This file contains the processing carried over an SG list associated with
a SCSI command such that it can be registered with the IB verbs. The
registration produces a network virtual address (VA) and a remote access
key (RKEY or STAG in iSER spec notation) which are used by the target for
its RDMA operation.
Signed-off-by: Or Gerlitz <ogerlitz@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- | drivers/infiniband/ulp/iser/iser_memory.c | 401 |
1 files changed, 401 insertions, 0 deletions
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
new file mode 100644
index 000000000000..31950a522a1c
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -0,0 +1,401 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | * $Id: iser_memory.c 6964 2006-05-07 11:11:43Z ogerlitz $ | ||
33 | */ | ||
34 | #include <linux/module.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/slab.h> | ||
37 | #include <linux/mm.h> | ||
38 | #include <asm/io.h> | ||
39 | #include <asm/scatterlist.h> | ||
40 | #include <linux/scatterlist.h> | ||
41 | |||
42 | #include "iscsi_iser.h" | ||
43 | |||
/*
 * 128K - kmalloc limit.  Copy buffers larger than this are taken from the
 * page allocator instead of kmalloc.
 */
#define ISER_KMALLOC_THRESHOLD 0x20000
45 | /** | ||
46 | * Decrements the reference count for the | ||
47 | * registered buffer & releases it | ||
48 | * | ||
49 | * returns 0 if released, 1 if deferred | ||
50 | */ | ||
51 | int iser_regd_buff_release(struct iser_regd_buf *regd_buf) | ||
52 | { | ||
53 | struct device *dma_device; | ||
54 | |||
55 | if ((atomic_read(®d_buf->ref_count) == 0) || | ||
56 | atomic_dec_and_test(®d_buf->ref_count)) { | ||
57 | /* if we used the dma mr, unreg is just NOP */ | ||
58 | if (regd_buf->reg.rkey != 0) | ||
59 | iser_unreg_mem(®d_buf->reg); | ||
60 | |||
61 | if (regd_buf->dma_addr) { | ||
62 | dma_device = regd_buf->device->ib_device->dma_device; | ||
63 | dma_unmap_single(dma_device, | ||
64 | regd_buf->dma_addr, | ||
65 | regd_buf->data_size, | ||
66 | regd_buf->direction); | ||
67 | } | ||
68 | /* else this regd buf is associated with task which we */ | ||
69 | /* dma_unmap_single/sg later */ | ||
70 | return 0; | ||
71 | } else { | ||
72 | iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf); | ||
73 | return 1; | ||
74 | } | ||
75 | } | ||
76 | |||
77 | /** | ||
78 | * iser_reg_single - fills registered buffer descriptor with | ||
79 | * registration information | ||
80 | */ | ||
81 | void iser_reg_single(struct iser_device *device, | ||
82 | struct iser_regd_buf *regd_buf, | ||
83 | enum dma_data_direction direction) | ||
84 | { | ||
85 | dma_addr_t dma_addr; | ||
86 | |||
87 | dma_addr = dma_map_single(device->ib_device->dma_device, | ||
88 | regd_buf->virt_addr, | ||
89 | regd_buf->data_size, direction); | ||
90 | BUG_ON(dma_mapping_error(dma_addr)); | ||
91 | |||
92 | regd_buf->reg.lkey = device->mr->lkey; | ||
93 | regd_buf->reg.rkey = 0; /* indicate there's no need to unreg */ | ||
94 | regd_buf->reg.len = regd_buf->data_size; | ||
95 | regd_buf->reg.va = dma_addr; | ||
96 | |||
97 | regd_buf->dma_addr = dma_addr; | ||
98 | regd_buf->direction = direction; | ||
99 | } | ||
100 | |||
101 | /** | ||
102 | * iser_start_rdma_unaligned_sg | ||
103 | */ | ||
104 | int iser_start_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask, | ||
105 | enum iser_data_dir cmd_dir) | ||
106 | { | ||
107 | int dma_nents; | ||
108 | struct device *dma_device; | ||
109 | char *mem = NULL; | ||
110 | struct iser_data_buf *data = &iser_ctask->data[cmd_dir]; | ||
111 | unsigned long cmd_data_len = data->data_len; | ||
112 | |||
113 | if (cmd_data_len > ISER_KMALLOC_THRESHOLD) | ||
114 | mem = (void *)__get_free_pages(GFP_NOIO, | ||
115 | long_log2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); | ||
116 | else | ||
117 | mem = kmalloc(cmd_data_len, GFP_NOIO); | ||
118 | |||
119 | if (mem == NULL) { | ||
120 | iser_err("Failed to allocate mem size %d %d for copying sglist\n", | ||
121 | data->size,(int)cmd_data_len); | ||
122 | return -ENOMEM; | ||
123 | } | ||
124 | |||
125 | if (cmd_dir == ISER_DIR_OUT) { | ||
126 | /* copy the unaligned sg the buffer which is used for RDMA */ | ||
127 | struct scatterlist *sg = (struct scatterlist *)data->buf; | ||
128 | int i; | ||
129 | char *p, *from; | ||
130 | |||
131 | for (p = mem, i = 0; i < data->size; i++) { | ||
132 | from = kmap_atomic(sg[i].page, KM_USER0); | ||
133 | memcpy(p, | ||
134 | from + sg[i].offset, | ||
135 | sg[i].length); | ||
136 | kunmap_atomic(from, KM_USER0); | ||
137 | p += sg[i].length; | ||
138 | } | ||
139 | } | ||
140 | |||
141 | sg_init_one(&iser_ctask->data_copy[cmd_dir].sg_single, mem, cmd_data_len); | ||
142 | iser_ctask->data_copy[cmd_dir].buf = | ||
143 | &iser_ctask->data_copy[cmd_dir].sg_single; | ||
144 | iser_ctask->data_copy[cmd_dir].size = 1; | ||
145 | |||
146 | iser_ctask->data_copy[cmd_dir].copy_buf = mem; | ||
147 | |||
148 | dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device; | ||
149 | |||
150 | if (cmd_dir == ISER_DIR_OUT) | ||
151 | dma_nents = dma_map_sg(dma_device, | ||
152 | &iser_ctask->data_copy[cmd_dir].sg_single, | ||
153 | 1, DMA_TO_DEVICE); | ||
154 | else | ||
155 | dma_nents = dma_map_sg(dma_device, | ||
156 | &iser_ctask->data_copy[cmd_dir].sg_single, | ||
157 | 1, DMA_FROM_DEVICE); | ||
158 | |||
159 | BUG_ON(dma_nents == 0); | ||
160 | |||
161 | iser_ctask->data_copy[cmd_dir].dma_nents = dma_nents; | ||
162 | return 0; | ||
163 | } | ||
164 | |||
165 | /** | ||
166 | * iser_finalize_rdma_unaligned_sg | ||
167 | */ | ||
168 | void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask, | ||
169 | enum iser_data_dir cmd_dir) | ||
170 | { | ||
171 | struct device *dma_device; | ||
172 | struct iser_data_buf *mem_copy; | ||
173 | unsigned long cmd_data_len; | ||
174 | |||
175 | dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device; | ||
176 | mem_copy = &iser_ctask->data_copy[cmd_dir]; | ||
177 | |||
178 | if (cmd_dir == ISER_DIR_OUT) | ||
179 | dma_unmap_sg(dma_device, &mem_copy->sg_single, 1, | ||
180 | DMA_TO_DEVICE); | ||
181 | else | ||
182 | dma_unmap_sg(dma_device, &mem_copy->sg_single, 1, | ||
183 | DMA_FROM_DEVICE); | ||
184 | |||
185 | if (cmd_dir == ISER_DIR_IN) { | ||
186 | char *mem; | ||
187 | struct scatterlist *sg; | ||
188 | unsigned char *p, *to; | ||
189 | unsigned int sg_size; | ||
190 | int i; | ||
191 | |||
192 | /* copy back read RDMA to unaligned sg */ | ||
193 | mem = mem_copy->copy_buf; | ||
194 | |||
195 | sg = (struct scatterlist *)iser_ctask->data[ISER_DIR_IN].buf; | ||
196 | sg_size = iser_ctask->data[ISER_DIR_IN].size; | ||
197 | |||
198 | for (p = mem, i = 0; i < sg_size; i++){ | ||
199 | to = kmap_atomic(sg[i].page, KM_SOFTIRQ0); | ||
200 | memcpy(to + sg[i].offset, | ||
201 | p, | ||
202 | sg[i].length); | ||
203 | kunmap_atomic(to, KM_SOFTIRQ0); | ||
204 | p += sg[i].length; | ||
205 | } | ||
206 | } | ||
207 | |||
208 | cmd_data_len = iser_ctask->data[cmd_dir].data_len; | ||
209 | |||
210 | if (cmd_data_len > ISER_KMALLOC_THRESHOLD) | ||
211 | free_pages((unsigned long)mem_copy->copy_buf, | ||
212 | long_log2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); | ||
213 | else | ||
214 | kfree(mem_copy->copy_buf); | ||
215 | |||
216 | mem_copy->copy_buf = NULL; | ||
217 | } | ||
218 | |||
219 | /** | ||
220 | * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses | ||
221 | * and returns the length of resulting physical address array (may be less than | ||
222 | * the original due to possible compaction). | ||
223 | * | ||
224 | * we build a "page vec" under the assumption that the SG meets the RDMA | ||
225 | * alignment requirements. Other then the first and last SG elements, all | ||
226 | * the "internal" elements can be compacted into a list whose elements are | ||
227 | * dma addresses of physical pages. The code supports also the weird case | ||
228 | * where --few fragments of the same page-- are present in the SG as | ||
229 | * consecutive elements. Also, it handles one entry SG. | ||
230 | */ | ||
231 | static int iser_sg_to_page_vec(struct iser_data_buf *data, | ||
232 | struct iser_page_vec *page_vec) | ||
233 | { | ||
234 | struct scatterlist *sg = (struct scatterlist *)data->buf; | ||
235 | dma_addr_t first_addr, last_addr, page; | ||
236 | int start_aligned, end_aligned; | ||
237 | unsigned int cur_page = 0; | ||
238 | unsigned long total_sz = 0; | ||
239 | int i; | ||
240 | |||
241 | /* compute the offset of first element */ | ||
242 | page_vec->offset = (u64) sg[0].offset; | ||
243 | |||
244 | for (i = 0; i < data->dma_nents; i++) { | ||
245 | total_sz += sg_dma_len(&sg[i]); | ||
246 | |||
247 | first_addr = sg_dma_address(&sg[i]); | ||
248 | last_addr = first_addr + sg_dma_len(&sg[i]); | ||
249 | |||
250 | start_aligned = !(first_addr & ~PAGE_MASK); | ||
251 | end_aligned = !(last_addr & ~PAGE_MASK); | ||
252 | |||
253 | /* continue to collect page fragments till aligned or SG ends */ | ||
254 | while (!end_aligned && (i + 1 < data->dma_nents)) { | ||
255 | i++; | ||
256 | total_sz += sg_dma_len(&sg[i]); | ||
257 | last_addr = sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]); | ||
258 | end_aligned = !(last_addr & ~PAGE_MASK); | ||
259 | } | ||
260 | |||
261 | first_addr = first_addr & PAGE_MASK; | ||
262 | |||
263 | for (page = first_addr; page < last_addr; page += PAGE_SIZE) | ||
264 | page_vec->pages[cur_page++] = page; | ||
265 | |||
266 | } | ||
267 | page_vec->data_size = total_sz; | ||
268 | iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); | ||
269 | return cur_page; | ||
270 | } | ||
271 | |||
/* mask selecting the offset of an address within a 4K block */
#define MASK_4K	((1UL << 12) - 1) /* 0xFFF */
/*
 * true when addr is a multiple of 4K.  The argument is parenthesized so
 * that the cast applies to the whole expression (e.g. "ptr + n"), not
 * only to its first operand.
 */
#define IS_4K_ALIGNED(addr)	((((unsigned long)(addr)) & MASK_4K) == 0)
274 | |||
275 | /** | ||
276 | * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned | ||
277 | * for RDMA sub-list of a scatter-gather list of memory buffers, and returns | ||
278 | * the number of entries which are aligned correctly. Supports the case where | ||
279 | * consecutive SG elements are actually fragments of the same physcial page. | ||
280 | */ | ||
281 | static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data) | ||
282 | { | ||
283 | struct scatterlist *sg; | ||
284 | dma_addr_t end_addr, next_addr; | ||
285 | int i, cnt; | ||
286 | unsigned int ret_len = 0; | ||
287 | |||
288 | sg = (struct scatterlist *)data->buf; | ||
289 | |||
290 | for (cnt = 0, i = 0; i < data->dma_nents; i++, cnt++) { | ||
291 | /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " | ||
292 | "offset: %ld sz: %ld\n", i, | ||
293 | (unsigned long)page_to_phys(sg[i].page), | ||
294 | (unsigned long)sg[i].offset, | ||
295 | (unsigned long)sg[i].length); */ | ||
296 | end_addr = sg_dma_address(&sg[i]) + | ||
297 | sg_dma_len(&sg[i]); | ||
298 | /* iser_dbg("Checking sg iobuf end address " | ||
299 | "0x%08lX\n", end_addr); */ | ||
300 | if (i + 1 < data->dma_nents) { | ||
301 | next_addr = sg_dma_address(&sg[i+1]); | ||
302 | /* are i, i+1 fragments of the same page? */ | ||
303 | if (end_addr == next_addr) | ||
304 | continue; | ||
305 | else if (!IS_4K_ALIGNED(end_addr)) { | ||
306 | ret_len = cnt + 1; | ||
307 | break; | ||
308 | } | ||
309 | } | ||
310 | } | ||
311 | if (i == data->dma_nents) | ||
312 | ret_len = cnt; /* loop ended */ | ||
313 | iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", | ||
314 | ret_len, data->dma_nents, data); | ||
315 | return ret_len; | ||
316 | } | ||
317 | |||
318 | static void iser_data_buf_dump(struct iser_data_buf *data) | ||
319 | { | ||
320 | struct scatterlist *sg = (struct scatterlist *)data->buf; | ||
321 | int i; | ||
322 | |||
323 | for (i = 0; i < data->size; i++) | ||
324 | iser_err("sg[%d] dma_addr:0x%lX page:0x%p " | ||
325 | "off:%d sz:%d dma_len:%d\n", | ||
326 | i, (unsigned long)sg_dma_address(&sg[i]), | ||
327 | sg[i].page, sg[i].offset, | ||
328 | sg[i].length,sg_dma_len(&sg[i])); | ||
329 | } | ||
330 | |||
331 | static void iser_dump_page_vec(struct iser_page_vec *page_vec) | ||
332 | { | ||
333 | int i; | ||
334 | |||
335 | iser_err("page vec length %d data size %d\n", | ||
336 | page_vec->length, page_vec->data_size); | ||
337 | for (i = 0; i < page_vec->length; i++) | ||
338 | iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); | ||
339 | } | ||
340 | |||
341 | static void iser_page_vec_build(struct iser_data_buf *data, | ||
342 | struct iser_page_vec *page_vec) | ||
343 | { | ||
344 | int page_vec_len = 0; | ||
345 | |||
346 | page_vec->length = 0; | ||
347 | page_vec->offset = 0; | ||
348 | |||
349 | iser_dbg("Translating sg sz: %d\n", data->dma_nents); | ||
350 | page_vec_len = iser_sg_to_page_vec(data,page_vec); | ||
351 | iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents,page_vec_len); | ||
352 | |||
353 | page_vec->length = page_vec_len; | ||
354 | |||
355 | if (page_vec_len * PAGE_SIZE < page_vec->data_size) { | ||
356 | iser_err("page_vec too short to hold this SG\n"); | ||
357 | iser_data_buf_dump(data); | ||
358 | iser_dump_page_vec(page_vec); | ||
359 | BUG(); | ||
360 | } | ||
361 | } | ||
362 | |||
363 | /** | ||
364 | * iser_reg_rdma_mem - Registers memory intended for RDMA, | ||
365 | * obtaining rkey and va | ||
366 | * | ||
367 | * returns 0 on success, errno code on failure | ||
368 | */ | ||
369 | int iser_reg_rdma_mem(struct iscsi_iser_cmd_task *iser_ctask, | ||
370 | enum iser_data_dir cmd_dir) | ||
371 | { | ||
372 | struct iser_conn *ib_conn = iser_ctask->iser_conn->ib_conn; | ||
373 | struct iser_data_buf *mem = &iser_ctask->data[cmd_dir]; | ||
374 | struct iser_regd_buf *regd_buf; | ||
375 | int aligned_len; | ||
376 | int err; | ||
377 | |||
378 | regd_buf = &iser_ctask->rdma_regd[cmd_dir]; | ||
379 | |||
380 | aligned_len = iser_data_buf_aligned_len(mem); | ||
381 | if (aligned_len != mem->size) { | ||
382 | iser_err("rdma alignment violation %d/%d aligned\n", | ||
383 | aligned_len, mem->size); | ||
384 | iser_data_buf_dump(mem); | ||
385 | /* allocate copy buf, if we are writing, copy the */ | ||
386 | /* unaligned scatterlist, dma map the copy */ | ||
387 | if (iser_start_rdma_unaligned_sg(iser_ctask, cmd_dir) != 0) | ||
388 | return -ENOMEM; | ||
389 | mem = &iser_ctask->data_copy[cmd_dir]; | ||
390 | } | ||
391 | |||
392 | iser_page_vec_build(mem, ib_conn->page_vec); | ||
393 | err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, ®d_buf->reg); | ||
394 | if (err) | ||
395 | return err; | ||
396 | |||
397 | /* take a reference on this regd buf such that it will not be released * | ||
398 | * (eg in send dto completion) before we get the scsi response */ | ||
399 | atomic_inc(®d_buf->ref_count); | ||
400 | return 0; | ||
401 | } | ||