author     Roland Dreier <rolandd@cisco.com>      2005-07-07 20:57:14 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>  2005-07-07 21:23:48 -0400
commit     eb8ffbfed50e7945c024a80e3688d5beffa3b641
tree       6b32628db0560e2f093efc50e7a0630b9bb678d0
parent     bc38a6abdd5a50e007d0fcd9b9b6280132b79e62
[PATCH] IB uverbs: memory pinning implementation
Add support for pinning userspace memory regions and returning a list of pages
in the region. This includes tracking pinned memory against vm_locked and
preventing unprivileged users from exceeding RLIMIT_MEMLOCK.
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
drivers/infiniband/core/uverbs_mem.c | 221
1 file changed, 221 insertions(+), 0 deletions(-)
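For orientation before the diff itself, here is a rough sketch of how a low-level driver's memory-registration path might consume the new API. It is not part of the patch: `struct my_mr`, `my_reg_user_mr`, and `my_dereg_user_mr` are hypothetical names, and the device-specific translation-table programming is elided.

```c
/*
 * Hypothetical consumer of the new pinning API (illustrative only).
 * A driver embeds a struct ib_umem in its own MR structure, pins the
 * user region at registration time, and releases it on deregistration.
 */
struct my_mr {
	struct ib_umem umem;	/* filled in by ib_umem_get() */
	/* ... device-specific translation table state ... */
};

static int my_reg_user_mr(struct ib_device *dev, struct my_mr *mr,
			  void *addr, size_t size, int write)
{
	struct ib_umem_chunk *chunk;
	int ret, i;

	/* Pin the pages and build the DMA-mapped chunk list. */
	ret = ib_umem_get(dev, &mr->umem, addr, size, write);
	if (ret)
		return ret;

	/* Walk the chunks to program the HCA's address translation. */
	list_for_each_entry(chunk, &mr->umem.chunk_list, list)
		for (i = 0; i < chunk->nmap; ++i)
			/* feed sg_dma_address(&chunk->page_list[i])
			 * and sg_dma_len(&chunk->page_list[i]) to the
			 * device (elided) */
			;

	return 0;
}

static void my_dereg_user_mr(struct ib_device *dev, struct my_mr *mr)
{
	/* Unpin, dirty the pages, and credit locked_vm back. */
	ib_umem_release(dev, &mr->umem);
}
```

Note that consumers iterate up to `chunk->nmap` (the count returned by `dma_map_sg()`, which may coalesce entries), not `chunk->nents`, when handing addresses to the hardware.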
diff --git a/drivers/infiniband/core/uverbs_mem.c b/drivers/infiniband/core/uverbs_mem.c
new file mode 100644
index 000000000000..ed550f6595bd
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_mem.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id: uverbs_mem.c 2743 2005-06-28 22:27:59Z roland $
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#include "uverbs.h"
+
+struct ib_umem_account_work {
+	struct work_struct work;
+	struct mm_struct  *mm;
+	unsigned long      diff;
+};
+
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+	struct ib_umem_chunk *chunk, *tmp;
+	int i;
+
+	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
+		dma_unmap_sg(dev->dma_device, chunk->page_list,
+			     chunk->nents, DMA_BIDIRECTIONAL);
+		for (i = 0; i < chunk->nents; ++i) {
+			if (umem->writable && dirty)
+				set_page_dirty_lock(chunk->page_list[i].page);
+			put_page(chunk->page_list[i].page);
+		}
+
+		kfree(chunk);
+	}
+}
+
+int ib_umem_get(struct ib_device *dev, struct ib_umem *mem,
+		void *addr, size_t size, int write)
+{
+	struct page **page_list;
+	struct ib_umem_chunk *chunk;
+	unsigned long locked;
+	unsigned long lock_limit;
+	unsigned long cur_base;
+	unsigned long npages;
+	int ret = 0;
+	int off;
+	int i;
+
+	if (!can_do_mlock())
+		return -EPERM;
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list)
+		return -ENOMEM;
+
+	mem->user_base = (unsigned long) addr;
+	mem->length    = size;
+	mem->offset    = (unsigned long) addr & ~PAGE_MASK;
+	mem->page_size = PAGE_SIZE;
+	mem->writable  = write;
+
+	INIT_LIST_HEAD(&mem->chunk_list);
+
+	npages = PAGE_ALIGN(size + mem->offset) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked     = npages + current->mm->locked_vm;
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cur_base = (unsigned long) addr & PAGE_MASK;
+
+	while (npages) {
+		ret = get_user_pages(current, current->mm, cur_base,
+				     min_t(int, npages,
+					   PAGE_SIZE / sizeof (struct page *)),
+				     1, !write, page_list, NULL);
+
+		if (ret < 0)
+			goto out;
+
+		cur_base += ret * PAGE_SIZE;
+		npages   -= ret;
+
+		off = 0;
+
+		while (ret) {
+			chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
+					min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
+					GFP_KERNEL);
+			if (!chunk) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
+			for (i = 0; i < chunk->nents; ++i) {
+				chunk->page_list[i].page   = page_list[i + off];
+				chunk->page_list[i].offset = 0;
+				chunk->page_list[i].length = PAGE_SIZE;
+			}
+
+			chunk->nmap = dma_map_sg(dev->dma_device,
+						 &chunk->page_list[0],
+						 chunk->nents,
+						 DMA_BIDIRECTIONAL);
+			if (chunk->nmap <= 0) {
+				for (i = 0; i < chunk->nents; ++i)
+					put_page(chunk->page_list[i].page);
+				kfree(chunk);
+
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			ret -= chunk->nents;
+			off += chunk->nents;
+			list_add_tail(&chunk->list, &mem->chunk_list);
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (ret < 0)
+		__ib_umem_release(dev, mem, 0);
+	else
+		current->mm->locked_vm = locked;
+
+	up_write(&current->mm->mmap_sem);
+	free_page((unsigned long) page_list);
+
+	return ret;
+}
+
+void ib_umem_release(struct ib_device *dev, struct ib_umem *umem)
+{
+	__ib_umem_release(dev, umem, 1);
+
+	down_write(&current->mm->mmap_sem);
+	current->mm->locked_vm -=
+		PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+	up_write(&current->mm->mmap_sem);
+}
+
+static void ib_umem_account(void *work_ptr)
+{
+	struct ib_umem_account_work *work = work_ptr;
+
+	down_write(&work->mm->mmap_sem);
+	work->mm->locked_vm -= work->diff;
+	up_write(&work->mm->mmap_sem);
+	mmput(work->mm);
+	kfree(work);
+}
+
+void ib_umem_release_on_close(struct ib_device *dev, struct ib_umem *umem)
+{
+	struct ib_umem_account_work *work;
+	struct mm_struct *mm;
+
+	__ib_umem_release(dev, umem, 1);
+
+	mm = get_task_mm(current);
+	if (!mm)
+		return;
+
+	/*
+	 * We may be called with the mm's mmap_sem already held.  This
+	 * can happen when a userspace munmap() is the call that drops
+	 * the last reference to our file and calls our release
+	 * method.  If there are memory regions to destroy, we'll end
+	 * up here and not be able to take the mmap_sem.  Therefore we
+	 * defer the vm_locked accounting to the system workqueue.
+	 */
+
+	work = kmalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return;
+
+	INIT_WORK(&work->work, ib_umem_account, work);
+	work->mm   = mm;
+	work->diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
+
+	schedule_work(&work->work);
+}
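A practical note on the RLIMIT_MEMLOCK check above: ib_umem_get() charges PAGE_ALIGN(size + offset) >> PAGE_SHIFT pages against the mm's locked_vm and fails with -ENOMEM once that total would exceed the memlock limit, unless the caller has CAP_IPC_LOCK. An unprivileged application that registers large regions therefore typically raises its memlock soft limit first. A minimal userspace sketch, again illustrative and not part of the patch:

```c
#include <stdio.h>
#include <sys/resource.h>

/*
 * Raise RLIMIT_MEMLOCK before registering large memory regions.
 * Without privilege, the soft limit can only be raised up to the
 * hard limit; once the kernel's pinning path would push locked_vm
 * past the limit, memory registration fails with ENOMEM.
 */
int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_MEMLOCK, &rl))
		return 1;

	rl.rlim_cur = rl.rlim_max;	/* use all the hard limit allows */
	if (setrlimit(RLIMIT_MEMLOCK, &rl))
		return 1;

	printf("memlock limit: %llu bytes\n",
	       (unsigned long long) rl.rlim_cur);
	return 0;
}
```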