diff options
author | Andy Grover <andy.grover@oracle.com> | 2009-02-24 10:30:26 -0500 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-02-27 02:39:28 -0500 |
commit | 7875e18e09961d29f30424c5e2e48e704dc3789b (patch) | |
tree | e0092fa58436f8f221694a94e119642331614c79 /net/rds | |
parent | 3e5048495c8569bfdd552750e0315973c61e7c93 (diff) |
RDS: Message parsing
Parsing of newly-received RDS message headers (including ext.
headers) and copy-to/from-user routines.
page.c implements a per-cpu page remainder cache, to reduce the
number of allocations needed for small datagrams.
Signed-off-by: Andy Grover <andy.grover@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/rds')
-rw-r--r-- | net/rds/message.c | 402 | ||||
-rw-r--r-- | net/rds/page.c | 221 |
2 files changed, 623 insertions, 0 deletions
diff --git a/net/rds/message.c b/net/rds/message.c new file mode 100644 index 000000000000..5a15dc8d0cd7 --- /dev/null +++ b/net/rds/message.c | |||
@@ -0,0 +1,402 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/kernel.h> | ||
34 | |||
35 | #include "rds.h" | ||
36 | #include "rdma.h" | ||
37 | |||
38 | static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); | ||
39 | |||
40 | static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { | ||
41 | [RDS_EXTHDR_NONE] = 0, | ||
42 | [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), | ||
43 | [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), | ||
44 | [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), | ||
45 | }; | ||
46 | |||
47 | |||
48 | void rds_message_addref(struct rds_message *rm) | ||
49 | { | ||
50 | rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); | ||
51 | atomic_inc(&rm->m_refcount); | ||
52 | } | ||
53 | |||
54 | /* | ||
55 | * This relies on dma_map_sg() not touching sg[].page during merging. | ||
56 | */ | ||
57 | static void rds_message_purge(struct rds_message *rm) | ||
58 | { | ||
59 | unsigned long i; | ||
60 | |||
61 | if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) | ||
62 | return; | ||
63 | |||
64 | for (i = 0; i < rm->m_nents; i++) { | ||
65 | rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); | ||
66 | /* XXX will have to put_page for page refs */ | ||
67 | __free_page(sg_page(&rm->m_sg[i])); | ||
68 | } | ||
69 | rm->m_nents = 0; | ||
70 | |||
71 | if (rm->m_rdma_op) | ||
72 | rds_rdma_free_op(rm->m_rdma_op); | ||
73 | if (rm->m_rdma_mr) | ||
74 | rds_mr_put(rm->m_rdma_mr); | ||
75 | } | ||
76 | |||
77 | void rds_message_inc_purge(struct rds_incoming *inc) | ||
78 | { | ||
79 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); | ||
80 | rds_message_purge(rm); | ||
81 | } | ||
82 | |||
83 | void rds_message_put(struct rds_message *rm) | ||
84 | { | ||
85 | rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); | ||
86 | |||
87 | if (atomic_dec_and_test(&rm->m_refcount)) { | ||
88 | BUG_ON(!list_empty(&rm->m_sock_item)); | ||
89 | BUG_ON(!list_empty(&rm->m_conn_item)); | ||
90 | rds_message_purge(rm); | ||
91 | |||
92 | kfree(rm); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | void rds_message_inc_free(struct rds_incoming *inc) | ||
97 | { | ||
98 | struct rds_message *rm = container_of(inc, struct rds_message, m_inc); | ||
99 | rds_message_put(rm); | ||
100 | } | ||
101 | |||
102 | void rds_message_populate_header(struct rds_header *hdr, __be16 sport, | ||
103 | __be16 dport, u64 seq) | ||
104 | { | ||
105 | hdr->h_flags = 0; | ||
106 | hdr->h_sport = sport; | ||
107 | hdr->h_dport = dport; | ||
108 | hdr->h_sequence = cpu_to_be64(seq); | ||
109 | hdr->h_exthdr[0] = RDS_EXTHDR_NONE; | ||
110 | } | ||
111 | |||
112 | int rds_message_add_extension(struct rds_header *hdr, | ||
113 | unsigned int type, const void *data, unsigned int len) | ||
114 | { | ||
115 | unsigned int ext_len = sizeof(u8) + len; | ||
116 | unsigned char *dst; | ||
117 | |||
118 | /* For now, refuse to add more than one extension header */ | ||
119 | if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) | ||
120 | return 0; | ||
121 | |||
122 | if (type >= __RDS_EXTHDR_MAX | ||
123 | || len != rds_exthdr_size[type]) | ||
124 | return 0; | ||
125 | |||
126 | if (ext_len >= RDS_HEADER_EXT_SPACE) | ||
127 | return 0; | ||
128 | dst = hdr->h_exthdr; | ||
129 | |||
130 | *dst++ = type; | ||
131 | memcpy(dst, data, len); | ||
132 | |||
133 | dst[len] = RDS_EXTHDR_NONE; | ||
134 | return 1; | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * If a message has extension headers, retrieve them here. | ||
139 | * Call like this: | ||
140 | * | ||
141 | * unsigned int pos = 0; | ||
142 | * | ||
143 | * while (1) { | ||
144 | * buflen = sizeof(buffer); | ||
145 | * type = rds_message_next_extension(hdr, &pos, buffer, &buflen); | ||
146 | * if (type == RDS_EXTHDR_NONE) | ||
147 | * break; | ||
148 | * ... | ||
149 | * } | ||
150 | */ | ||
151 | int rds_message_next_extension(struct rds_header *hdr, | ||
152 | unsigned int *pos, void *buf, unsigned int *buflen) | ||
153 | { | ||
154 | unsigned int offset, ext_type, ext_len; | ||
155 | u8 *src = hdr->h_exthdr; | ||
156 | |||
157 | offset = *pos; | ||
158 | if (offset >= RDS_HEADER_EXT_SPACE) | ||
159 | goto none; | ||
160 | |||
161 | /* Get the extension type and length. For now, the | ||
162 | * length is implied by the extension type. */ | ||
163 | ext_type = src[offset++]; | ||
164 | |||
165 | if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX) | ||
166 | goto none; | ||
167 | ext_len = rds_exthdr_size[ext_type]; | ||
168 | if (offset + ext_len > RDS_HEADER_EXT_SPACE) | ||
169 | goto none; | ||
170 | |||
171 | *pos = offset + ext_len; | ||
172 | if (ext_len < *buflen) | ||
173 | *buflen = ext_len; | ||
174 | memcpy(buf, src + offset, *buflen); | ||
175 | return ext_type; | ||
176 | |||
177 | none: | ||
178 | *pos = RDS_HEADER_EXT_SPACE; | ||
179 | *buflen = 0; | ||
180 | return RDS_EXTHDR_NONE; | ||
181 | } | ||
182 | |||
183 | int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version) | ||
184 | { | ||
185 | struct rds_ext_header_version ext_hdr; | ||
186 | |||
187 | ext_hdr.h_version = cpu_to_be32(version); | ||
188 | return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr)); | ||
189 | } | ||
190 | |||
191 | int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version) | ||
192 | { | ||
193 | struct rds_ext_header_version ext_hdr; | ||
194 | unsigned int pos = 0, len = sizeof(ext_hdr); | ||
195 | |||
196 | /* We assume the version extension is the only one present */ | ||
197 | if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION) | ||
198 | return 0; | ||
199 | *version = be32_to_cpu(ext_hdr.h_version); | ||
200 | return 1; | ||
201 | } | ||
202 | |||
203 | int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) | ||
204 | { | ||
205 | struct rds_ext_header_rdma_dest ext_hdr; | ||
206 | |||
207 | ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); | ||
208 | ext_hdr.h_rdma_offset = cpu_to_be32(offset); | ||
209 | return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); | ||
210 | } | ||
211 | |||
212 | struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) | ||
213 | { | ||
214 | struct rds_message *rm; | ||
215 | |||
216 | rm = kzalloc(sizeof(struct rds_message) + | ||
217 | (nents * sizeof(struct scatterlist)), gfp); | ||
218 | if (!rm) | ||
219 | goto out; | ||
220 | |||
221 | if (nents) | ||
222 | sg_init_table(rm->m_sg, nents); | ||
223 | atomic_set(&rm->m_refcount, 1); | ||
224 | INIT_LIST_HEAD(&rm->m_sock_item); | ||
225 | INIT_LIST_HEAD(&rm->m_conn_item); | ||
226 | spin_lock_init(&rm->m_rs_lock); | ||
227 | |||
228 | out: | ||
229 | return rm; | ||
230 | } | ||
231 | |||
232 | struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) | ||
233 | { | ||
234 | struct rds_message *rm; | ||
235 | unsigned int i; | ||
236 | |||
237 | rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); | ||
238 | if (rm == NULL) | ||
239 | return ERR_PTR(-ENOMEM); | ||
240 | |||
241 | set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); | ||
242 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); | ||
243 | rm->m_nents = ceil(total_len, PAGE_SIZE); | ||
244 | |||
245 | for (i = 0; i < rm->m_nents; ++i) { | ||
246 | sg_set_page(&rm->m_sg[i], | ||
247 | virt_to_page(page_addrs[i]), | ||
248 | PAGE_SIZE, 0); | ||
249 | } | ||
250 | |||
251 | return rm; | ||
252 | } | ||
253 | |||
254 | struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, | ||
255 | size_t total_len) | ||
256 | { | ||
257 | unsigned long to_copy; | ||
258 | unsigned long iov_off; | ||
259 | unsigned long sg_off; | ||
260 | struct rds_message *rm; | ||
261 | struct iovec *iov; | ||
262 | struct scatterlist *sg; | ||
263 | int ret; | ||
264 | |||
265 | rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); | ||
266 | if (rm == NULL) { | ||
267 | ret = -ENOMEM; | ||
268 | goto out; | ||
269 | } | ||
270 | |||
271 | rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); | ||
272 | |||
273 | /* | ||
274 | * now allocate and copy in the data payload. | ||
275 | */ | ||
276 | sg = rm->m_sg; | ||
277 | iov = first_iov; | ||
278 | iov_off = 0; | ||
279 | sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ | ||
280 | |||
281 | while (total_len) { | ||
282 | if (sg_page(sg) == NULL) { | ||
283 | ret = rds_page_remainder_alloc(sg, total_len, | ||
284 | GFP_HIGHUSER); | ||
285 | if (ret) | ||
286 | goto out; | ||
287 | rm->m_nents++; | ||
288 | sg_off = 0; | ||
289 | } | ||
290 | |||
291 | while (iov_off == iov->iov_len) { | ||
292 | iov_off = 0; | ||
293 | iov++; | ||
294 | } | ||
295 | |||
296 | to_copy = min(iov->iov_len - iov_off, sg->length - sg_off); | ||
297 | to_copy = min_t(size_t, to_copy, total_len); | ||
298 | |||
299 | rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to " | ||
300 | "sg [%p, %u, %u] + %lu\n", | ||
301 | to_copy, iov->iov_base, iov->iov_len, iov_off, | ||
302 | (void *)sg_page(sg), sg->offset, sg->length, sg_off); | ||
303 | |||
304 | ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off, | ||
305 | iov->iov_base + iov_off, | ||
306 | to_copy); | ||
307 | if (ret) | ||
308 | goto out; | ||
309 | |||
310 | iov_off += to_copy; | ||
311 | total_len -= to_copy; | ||
312 | sg_off += to_copy; | ||
313 | |||
314 | if (sg_off == sg->length) | ||
315 | sg++; | ||
316 | } | ||
317 | |||
318 | ret = 0; | ||
319 | out: | ||
320 | if (ret) { | ||
321 | if (rm) | ||
322 | rds_message_put(rm); | ||
323 | rm = ERR_PTR(ret); | ||
324 | } | ||
325 | return rm; | ||
326 | } | ||
327 | |||
328 | int rds_message_inc_copy_to_user(struct rds_incoming *inc, | ||
329 | struct iovec *first_iov, size_t size) | ||
330 | { | ||
331 | struct rds_message *rm; | ||
332 | struct iovec *iov; | ||
333 | struct scatterlist *sg; | ||
334 | unsigned long to_copy; | ||
335 | unsigned long iov_off; | ||
336 | unsigned long vec_off; | ||
337 | int copied; | ||
338 | int ret; | ||
339 | u32 len; | ||
340 | |||
341 | rm = container_of(inc, struct rds_message, m_inc); | ||
342 | len = be32_to_cpu(rm->m_inc.i_hdr.h_len); | ||
343 | |||
344 | iov = first_iov; | ||
345 | iov_off = 0; | ||
346 | sg = rm->m_sg; | ||
347 | vec_off = 0; | ||
348 | copied = 0; | ||
349 | |||
350 | while (copied < size && copied < len) { | ||
351 | while (iov_off == iov->iov_len) { | ||
352 | iov_off = 0; | ||
353 | iov++; | ||
354 | } | ||
355 | |||
356 | to_copy = min(iov->iov_len - iov_off, sg->length - vec_off); | ||
357 | to_copy = min_t(size_t, to_copy, size - copied); | ||
358 | to_copy = min_t(unsigned long, to_copy, len - copied); | ||
359 | |||
360 | rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to " | ||
361 | "sg [%p, %u, %u] + %lu\n", | ||
362 | to_copy, iov->iov_base, iov->iov_len, iov_off, | ||
363 | sg_page(sg), sg->offset, sg->length, vec_off); | ||
364 | |||
365 | ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off, | ||
366 | iov->iov_base + iov_off, | ||
367 | to_copy); | ||
368 | if (ret) { | ||
369 | copied = ret; | ||
370 | break; | ||
371 | } | ||
372 | |||
373 | iov_off += to_copy; | ||
374 | vec_off += to_copy; | ||
375 | copied += to_copy; | ||
376 | |||
377 | if (vec_off == sg->length) { | ||
378 | vec_off = 0; | ||
379 | sg++; | ||
380 | } | ||
381 | } | ||
382 | |||
383 | return copied; | ||
384 | } | ||
385 | |||
386 | /* | ||
387 | * If the message is still on the send queue, wait until the transport | ||
388 | * is done with it. This is particularly important for RDMA operations. | ||
389 | */ | ||
390 | void rds_message_wait(struct rds_message *rm) | ||
391 | { | ||
392 | wait_event(rds_message_flush_waitq, | ||
393 | !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); | ||
394 | } | ||
395 | |||
396 | void rds_message_unmapped(struct rds_message *rm) | ||
397 | { | ||
398 | clear_bit(RDS_MSG_MAPPED, &rm->m_flags); | ||
399 | if (waitqueue_active(&rds_message_flush_waitq)) | ||
400 | wake_up(&rds_message_flush_waitq); | ||
401 | } | ||
402 | |||
diff --git a/net/rds/page.c b/net/rds/page.c new file mode 100644 index 000000000000..c460743a89ad --- /dev/null +++ b/net/rds/page.c | |||
@@ -0,0 +1,221 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/highmem.h> | ||
34 | |||
35 | #include "rds.h" | ||
36 | |||
37 | struct rds_page_remainder { | ||
38 | struct page *r_page; | ||
39 | unsigned long r_offset; | ||
40 | }; | ||
41 | |||
42 | DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; | ||
43 | |||
44 | /* | ||
45 | * returns 0 on success or -errno on failure. | ||
46 | * | ||
47 | * We don't have to worry about flush_dcache_page() as this only works | ||
48 | * with private pages. If, say, we were to do directed receive to pinned | ||
49 | * user pages we'd have to worry more about cache coherence. (Though | ||
50 | * the flush_dcache_page() in get_user_pages() would probably be enough). | ||
51 | */ | ||
52 | int rds_page_copy_user(struct page *page, unsigned long offset, | ||
53 | void __user *ptr, unsigned long bytes, | ||
54 | int to_user) | ||
55 | { | ||
56 | unsigned long ret; | ||
57 | void *addr; | ||
58 | |||
59 | if (to_user) | ||
60 | rds_stats_add(s_copy_to_user, bytes); | ||
61 | else | ||
62 | rds_stats_add(s_copy_from_user, bytes); | ||
63 | |||
64 | addr = kmap_atomic(page, KM_USER0); | ||
65 | if (to_user) | ||
66 | ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); | ||
67 | else | ||
68 | ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); | ||
69 | kunmap_atomic(addr, KM_USER0); | ||
70 | |||
71 | if (ret) { | ||
72 | addr = kmap(page); | ||
73 | if (to_user) | ||
74 | ret = copy_to_user(ptr, addr + offset, bytes); | ||
75 | else | ||
76 | ret = copy_from_user(addr + offset, ptr, bytes); | ||
77 | kunmap(page); | ||
78 | if (ret) | ||
79 | return -EFAULT; | ||
80 | } | ||
81 | |||
82 | return 0; | ||
83 | } | ||
84 | |||
85 | /* | ||
86 | * Message allocation uses this to build up regions of a message. | ||
87 | * | ||
88 | * @bytes - the number of bytes needed. | ||
89 | * @gfp - the waiting behaviour of the allocation | ||
90 | * | ||
91 | * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to | ||
92 | * kmap the pages, etc. | ||
93 | * | ||
94 | * If @bytes is at least a full page then this just returns a page from | ||
95 | * alloc_page(). | ||
96 | * | ||
97 | * If @bytes is a partial page then this stores the unused region of the | ||
98 | * page in a per-cpu structure. Future partial-page allocations may be | ||
99 | * satisfied from that cached region. This lets us waste less memory on | ||
100 | * small allocations with minimal complexity. It works because the transmit | ||
101 | * path passes read-only page regions down to devices. They hold a page | ||
102 | * reference until they are done with the region. | ||
103 | */ | ||
104 | int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, | ||
105 | gfp_t gfp) | ||
106 | { | ||
107 | struct rds_page_remainder *rem; | ||
108 | unsigned long flags; | ||
109 | struct page *page; | ||
110 | int ret; | ||
111 | |||
112 | gfp |= __GFP_HIGHMEM; | ||
113 | |||
114 | /* jump straight to allocation if we're trying for a huge page */ | ||
115 | if (bytes >= PAGE_SIZE) { | ||
116 | page = alloc_page(gfp); | ||
117 | if (page == NULL) { | ||
118 | ret = -ENOMEM; | ||
119 | } else { | ||
120 | sg_set_page(scat, page, PAGE_SIZE, 0); | ||
121 | ret = 0; | ||
122 | } | ||
123 | goto out; | ||
124 | } | ||
125 | |||
126 | rem = &per_cpu(rds_page_remainders, get_cpu()); | ||
127 | local_irq_save(flags); | ||
128 | |||
129 | while (1) { | ||
130 | /* avoid a tiny region getting stuck by tossing it */ | ||
131 | if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { | ||
132 | rds_stats_inc(s_page_remainder_miss); | ||
133 | __free_page(rem->r_page); | ||
134 | rem->r_page = NULL; | ||
135 | } | ||
136 | |||
137 | /* hand out a fragment from the cached page */ | ||
138 | if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { | ||
139 | sg_set_page(scat, rem->r_page, bytes, rem->r_offset); | ||
140 | get_page(sg_page(scat)); | ||
141 | |||
142 | if (rem->r_offset != 0) | ||
143 | rds_stats_inc(s_page_remainder_hit); | ||
144 | |||
145 | rem->r_offset += bytes; | ||
146 | if (rem->r_offset == PAGE_SIZE) { | ||
147 | __free_page(rem->r_page); | ||
148 | rem->r_page = NULL; | ||
149 | } | ||
150 | ret = 0; | ||
151 | break; | ||
152 | } | ||
153 | |||
154 | /* alloc if there is nothing for us to use */ | ||
155 | local_irq_restore(flags); | ||
156 | put_cpu(); | ||
157 | |||
158 | page = alloc_page(gfp); | ||
159 | |||
160 | rem = &per_cpu(rds_page_remainders, get_cpu()); | ||
161 | local_irq_save(flags); | ||
162 | |||
163 | if (page == NULL) { | ||
164 | ret = -ENOMEM; | ||
165 | break; | ||
166 | } | ||
167 | |||
168 | /* did someone race to fill the remainder before us? */ | ||
169 | if (rem->r_page) { | ||
170 | __free_page(page); | ||
171 | continue; | ||
172 | } | ||
173 | |||
174 | /* otherwise install our page and loop around to alloc */ | ||
175 | rem->r_page = page; | ||
176 | rem->r_offset = 0; | ||
177 | } | ||
178 | |||
179 | local_irq_restore(flags); | ||
180 | put_cpu(); | ||
181 | out: | ||
182 | rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, | ||
183 | ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, | ||
184 | ret ? 0 : scat->length); | ||
185 | return ret; | ||
186 | } | ||
187 | |||
188 | static int rds_page_remainder_cpu_notify(struct notifier_block *self, | ||
189 | unsigned long action, void *hcpu) | ||
190 | { | ||
191 | struct rds_page_remainder *rem; | ||
192 | long cpu = (long)hcpu; | ||
193 | |||
194 | rem = &per_cpu(rds_page_remainders, cpu); | ||
195 | |||
196 | rdsdebug("cpu %ld action 0x%lx\n", cpu, action); | ||
197 | |||
198 | switch (action) { | ||
199 | case CPU_DEAD: | ||
200 | if (rem->r_page) | ||
201 | __free_page(rem->r_page); | ||
202 | rem->r_page = NULL; | ||
203 | break; | ||
204 | } | ||
205 | |||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | static struct notifier_block rds_page_remainder_nb = { | ||
210 | .notifier_call = rds_page_remainder_cpu_notify, | ||
211 | }; | ||
212 | |||
213 | void rds_page_exit(void) | ||
214 | { | ||
215 | int i; | ||
216 | |||
217 | for_each_possible_cpu(i) | ||
218 | rds_page_remainder_cpu_notify(&rds_page_remainder_nb, | ||
219 | (unsigned long)CPU_DEAD, | ||
220 | (void *)(long)i); | ||
221 | } | ||