Diffstat (limited to 'fs/orangefs/orangefs-bufmap.c')
-rw-r--r-- | fs/orangefs/orangefs-bufmap.c | 556
1 file changed, 556 insertions, 0 deletions
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
new file mode 100644
index 000000000000..1f8acc9f9a88
--- /dev/null
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -0,0 +1,556 @@
/*
 * (C) 2001 Clemson University and The University of Chicago
 *
 * See COPYING in top-level directory.
 */
#include "protocol.h"
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

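/*
 * A slot map hands out indices into a bitmap of in-use buffer
 * descriptors.  'c' is the number of free slots while a map is live;
 * it is -1 before install() (and again once a killed map has fully
 * drained), and mark_killed() pushes it below zero so late put() calls
 * can see the shutdown in progress.  All fields are protected by the
 * waitqueue's own lock, q.lock.
 */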
struct slot_map {
	int c;
	wait_queue_head_t q;
	int count;
	unsigned long *map;
};

static struct slot_map rw_map = {
	.c = -1,
	.q = __WAIT_QUEUE_HEAD_INITIALIZER(rw_map.q)
};
static struct slot_map readdir_map = {
	.c = -1,
	.q = __WAIT_QUEUE_HEAD_INITIALIZER(readdir_map.q)
};


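/*
 * Publish a bitmap of 'count' slots and wake everyone who has been
 * waiting for a map to be installed.
 */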
static void install(struct slot_map *m, int count, unsigned long *map)
{
	spin_lock(&m->q.lock);
	m->c = m->count = count;
	m->map = map;
	wake_up_all_locked(&m->q);
	spin_unlock(&m->q.lock);
}

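/*
 * Begin shutdown: push 'c' below zero so new get() callers behave as if
 * no map were installed, and so put() can tell when the last
 * outstanding slot has come back (c climbs back to -1).
 */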
static void mark_killed(struct slot_map *m)
{
	spin_lock(&m->q.lock);
	m->c -= m->count + 1;
	spin_unlock(&m->q.lock);
}

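/*
 * Wait, uninterruptibly, until every slot handed out before
 * mark_killed() has been returned, then forget the bitmap.
 */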
static void run_down(struct slot_map *m)
{
	DEFINE_WAIT(wait);
	spin_lock(&m->q.lock);
	if (m->c != -1) {
		for (;;) {
			if (likely(list_empty(&wait.task_list)))
				__add_wait_queue_tail(&m->q, &wait);
			set_current_state(TASK_UNINTERRUPTIBLE);

			if (m->c == -1)
				break;

			spin_unlock(&m->q.lock);
			schedule();
			spin_lock(&m->q.lock);
		}
		__remove_wait_queue(&m->q, &wait);
		__set_current_state(TASK_RUNNING);
	}
	m->map = NULL;
	spin_unlock(&m->q.lock);
}

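/*
 * Return a slot: clear its bit and bump 'c'.  Wake one waiter when this
 * creates the first free slot, or all waiters once a dying map has
 * fully drained.
 */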
static void put(struct slot_map *m, int slot)
{
	int v;
	spin_lock(&m->q.lock);
	__clear_bit(slot, m->map);
	v = ++m->c;
	if (unlikely(v == 1))	/* no free slots -> one free slot */
		wake_up_locked(&m->q);
	else if (unlikely(v == -1))	/* finished dying */
		wake_up_all_locked(&m->q);
	spin_unlock(&m->q.lock);
}

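/*
 * Sleep until a slot becomes free or slot_timeout_secs expires.  While
 * no map is installed (c < 0) each nap is capped at
 * ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS so we give up early if the map
 * never shows up.  Called, and returns, with q.lock held; returns 0,
 * -EINTR or -ETIMEDOUT.
 */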
static int wait_for_free(struct slot_map *m)
{
	long left = slot_timeout_secs * HZ;
	DEFINE_WAIT(wait);

	do {
		long n = left, t;
		if (likely(list_empty(&wait.task_list)))
			__add_wait_queue_tail_exclusive(&m->q, &wait);
		set_current_state(TASK_INTERRUPTIBLE);

		if (m->c > 0)
			break;

		if (m->c < 0) {
			/* we are waiting for map to be installed */
			/* it had better be there soon, or we go away */
			if (n > ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ)
				n = ORANGEFS_BUFMAP_WAIT_TIMEOUT_SECS * HZ;
		}
		spin_unlock(&m->q.lock);
		t = schedule_timeout(n);
		spin_lock(&m->q.lock);
		if (unlikely(!t) && n != left && m->c < 0)
			left = t;
		else
			left = t + (left - n);
		if (unlikely(signal_pending(current)))
			left = -EINTR;
	} while (left > 0);

	if (!list_empty(&wait.task_list))
		list_del(&wait.task_list);
	else if (left <= 0 && waitqueue_active(&m->q))
		__wake_up_locked_key(&m->q, TASK_INTERRUPTIBLE, NULL);
	__set_current_state(TASK_RUNNING);

	if (likely(left > 0))
		return 0;

	return left < 0 ? -EINTR : -ETIMEDOUT;
}

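/*
 * Claim a free slot, sleeping in wait_for_free() if none is available.
 * Returns the slot index on success or a negative errno.
 */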
static int get(struct slot_map *m)
{
	int res = 0;
	spin_lock(&m->q.lock);
	if (unlikely(m->c <= 0))
		res = wait_for_free(m);
	if (likely(!res)) {
		m->c--;
		res = find_first_zero_bit(m->map, m->count);
		__set_bit(res, m->map);
	}
	spin_unlock(&m->q.lock);
	return res;
}

/* used to describe mapped buffers */
struct orangefs_bufmap_desc {
	void *uaddr;			/* user space address pointer */
	struct page **page_array;	/* array of mapped pages */
	int array_count;		/* size of above arrays */
	struct list_head list_link;
};

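/*
 * State for the shared memory region handed to us by the userspace
 * client-core: the pinned pages, the fixed-size descriptors carved out
 * of them, and the bitmaps tracking which descriptors are in use.
 */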
static struct orangefs_bufmap {
	int desc_size;
	int desc_shift;
	int desc_count;
	int total_size;
	int page_count;

	struct page **page_array;
	struct orangefs_bufmap_desc *desc_array;

	/* array to track usage of buffer descriptors */
	unsigned long *buffer_index_array;

	/* array to track usage of buffer descriptors for readdir */
#define N DIV_ROUND_UP(ORANGEFS_READDIR_DEFAULT_DESC_COUNT, BITS_PER_LONG)
	unsigned long readdir_index_array[N];
#undef N
} *__orangefs_bufmap;

static DEFINE_SPINLOCK(orangefs_bufmap_lock);

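/* Drop the page references taken by get_user_pages_fast(). */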
static void
orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap)
{
	int i;

	for (i = 0; i < bufmap->page_count; i++)
		page_cache_release(bufmap->page_array[i]);
}

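/* Free the kernel-side bookkeeping allocated in orangefs_bufmap_alloc(). */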
static void
orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
{
	kfree(bufmap->page_array);
	kfree(bufmap->desc_array);
	kfree(bufmap->buffer_index_array);
	kfree(bufmap);
}

/*
 * XXX: Can the size and shift change while the caller gives up the
 * XXX: lock between calling this and doing something useful?
 */

int orangefs_bufmap_size_query(void)
{
	struct orangefs_bufmap *bufmap;
	int size = 0;
	spin_lock(&orangefs_bufmap_lock);
	bufmap = __orangefs_bufmap;
	if (bufmap)
		size = bufmap->desc_size;
	spin_unlock(&orangefs_bufmap_lock);
	return size;
}

int orangefs_bufmap_shift_query(void)
{
	struct orangefs_bufmap *bufmap;
	int shift = 0;
	spin_lock(&orangefs_bufmap_lock);
	bufmap = __orangefs_bufmap;
	if (bufmap)
		shift = bufmap->desc_shift;
	spin_unlock(&orangefs_bufmap_lock);
	return shift;
}

static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);

/*
 * orangefs_get_bufmap_init
 *
 * If bufmap_init is 1, then the shared memory system, including the
 * buffer_index_array, is available. Otherwise, it is not.
 *
 * returns the value of bufmap_init
 */
int orangefs_get_bufmap_init(void)
{
	return __orangefs_bufmap ? 1 : 0;
}


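/*
 * Allocate the bufmap bookkeeping for the geometry described by
 * user_desc; the user pages themselves are pinned later, in
 * orangefs_bufmap_map().
 */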
static struct orangefs_bufmap *
orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
{
	struct orangefs_bufmap *bufmap;

	bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL);
	if (!bufmap)
		goto out;

	bufmap->total_size = user_desc->total_size;
	bufmap->desc_count = user_desc->count;
	bufmap->desc_size = user_desc->size;
	bufmap->desc_shift = ilog2(bufmap->desc_size);

	bufmap->buffer_index_array =
		kzalloc(DIV_ROUND_UP(bufmap->desc_count, BITS_PER_LONG), GFP_KERNEL);
	if (!bufmap->buffer_index_array) {
		gossip_err("orangefs: could not allocate %d buffer indices\n",
			   bufmap->desc_count);
		goto out_free_bufmap;
	}

	bufmap->desc_array =
		kcalloc(bufmap->desc_count, sizeof(struct orangefs_bufmap_desc),
			GFP_KERNEL);
	if (!bufmap->desc_array) {
		gossip_err("orangefs: could not allocate %d descriptors\n",
			   bufmap->desc_count);
		goto out_free_index_array;
	}

	bufmap->page_count = bufmap->total_size / PAGE_SIZE;

	/* allocate storage to track our page mappings */
	bufmap->page_array =
		kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL);
	if (!bufmap->page_array)
		goto out_free_desc_array;

	return bufmap;

out_free_desc_array:
	kfree(bufmap->desc_array);
out_free_index_array:
	kfree(bufmap->buffer_index_array);
out_free_bufmap:
	kfree(bufmap);
out:
	return NULL;
}

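/*
 * Pin the client-core's buffer pages and point each descriptor at its
 * pages_per_desc-page slice of the region.
 */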
static int
orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
		struct ORANGEFS_dev_map_desc *user_desc)
{
	int pages_per_desc = bufmap->desc_size / PAGE_SIZE;
	int offset = 0, ret, i;

	/* map the pages */
	ret = get_user_pages_fast((unsigned long)user_desc->ptr,
				  bufmap->page_count, 1, bufmap->page_array);

	if (ret < 0)
		return ret;

	if (ret != bufmap->page_count) {
		gossip_err("orangefs error: asked for %d pages, only got %d.\n",
			   bufmap->page_count, ret);

		for (i = 0; i < ret; i++) {
			SetPageError(bufmap->page_array[i]);
			page_cache_release(bufmap->page_array[i]);
		}
		return -ENOMEM;
	}

	/*
	 * ideally we want to get kernel space pointers for each page, but
	 * we can't kmap that many pages at once if highmem is being used.
	 * so instead, we just kmap/kunmap the page address each time the
	 * kaddr is needed.
	 */
	for (i = 0; i < bufmap->page_count; i++)
		flush_dcache_page(bufmap->page_array[i]);

	/* build a list of available descriptors */
	for (offset = 0, i = 0; i < bufmap->desc_count; i++) {
		bufmap->desc_array[i].page_array = &bufmap->page_array[offset];
		bufmap->desc_array[i].array_count = pages_per_desc;
		bufmap->desc_array[i].uaddr =
		    (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE));
		offset += pages_per_desc;
	}

	return 0;
}

/*
 * orangefs_bufmap_initialize()
 *
 * initializes the mapped buffer interface
 *
 * returns 0 on success, -errno on failure
 */
int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc)
{
	struct orangefs_bufmap *bufmap;
	int ret = -EINVAL;

	gossip_debug(GOSSIP_BUFMAP_DEBUG,
		     "orangefs_bufmap_initialize: called (ptr ("
		     "%p) sz (%d) cnt(%d).\n",
		     user_desc->ptr,
		     user_desc->size,
		     user_desc->count);

	/*
	 * sanity check alignment and size of buffer that caller wants to
	 * work with
	 */
	if (PAGE_ALIGN((unsigned long)user_desc->ptr) !=
	    (unsigned long)user_desc->ptr) {
		gossip_err("orangefs error: memory alignment (front). %p\n",
			   user_desc->ptr);
		goto out;
	}

	if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size))
	    != (unsigned long)(user_desc->ptr + user_desc->total_size)) {
		gossip_err("orangefs error: memory alignment (back).(%p + %d)\n",
			   user_desc->ptr,
			   user_desc->total_size);
		goto out;
	}

	if (user_desc->total_size != (user_desc->size * user_desc->count)) {
		gossip_err("orangefs error: user provided an oddly sized buffer: (%d, %d, %d)\n",
			   user_desc->total_size,
			   user_desc->size,
			   user_desc->count);
		goto out;
	}

	if ((user_desc->size % PAGE_SIZE) != 0) {
		gossip_err("orangefs error: bufmap size not page size divisible (%d).\n",
			   user_desc->size);
		goto out;
	}

	ret = -ENOMEM;
	bufmap = orangefs_bufmap_alloc(user_desc);
	if (!bufmap)
		goto out;

	ret = orangefs_bufmap_map(bufmap, user_desc);
	if (ret)
		goto out_free_bufmap;


	spin_lock(&orangefs_bufmap_lock);
	if (__orangefs_bufmap) {
		spin_unlock(&orangefs_bufmap_lock);
		gossip_err("orangefs: error: bufmap already initialized.\n");
		ret = -EINVAL;
		goto out_unmap_bufmap;
	}
	__orangefs_bufmap = bufmap;
	install(&rw_map,
		bufmap->desc_count,
		bufmap->buffer_index_array);
	install(&readdir_map,
		ORANGEFS_READDIR_DEFAULT_DESC_COUNT,
		bufmap->readdir_index_array);
	spin_unlock(&orangefs_bufmap_lock);

	gossip_debug(GOSSIP_BUFMAP_DEBUG,
		     "orangefs_bufmap_initialize: exiting normally\n");
	return 0;

out_unmap_bufmap:
	orangefs_bufmap_unmap(bufmap);
out_free_bufmap:
	orangefs_bufmap_free(bufmap);
out:
	return ret;
}

/*
 * orangefs_bufmap_finalize()
 *
 * shuts down the mapped buffer interface and releases any resources
 * associated with it
 *
 * no return value
 */
void orangefs_bufmap_finalize(void)
{
	struct orangefs_bufmap *bufmap = __orangefs_bufmap;
	if (!bufmap)
		return;
	gossip_debug(GOSSIP_BUFMAP_DEBUG, "orangefs_bufmap_finalize: called\n");
	mark_killed(&rw_map);
	mark_killed(&readdir_map);
	gossip_debug(GOSSIP_BUFMAP_DEBUG,
		     "orangefs_bufmap_finalize: exiting normally\n");
}

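/*
 * Wait for every outstanding descriptor to be returned, then drop the
 * shared memory region: unpin its pages and free the bookkeeping.
 */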
void orangefs_bufmap_run_down(void)
{
	struct orangefs_bufmap *bufmap = __orangefs_bufmap;
	if (!bufmap)
		return;
	run_down(&rw_map);
	run_down(&readdir_map);
	spin_lock(&orangefs_bufmap_lock);
	__orangefs_bufmap = NULL;
	spin_unlock(&orangefs_bufmap_lock);
	orangefs_bufmap_unmap(bufmap);
	orangefs_bufmap_free(bufmap);
}

/*
 * orangefs_bufmap_get()
 *
 * gets a free mapped buffer descriptor, will sleep until one becomes
 * available if necessary
 *
 * returns slot on success, -errno on failure
 */
int orangefs_bufmap_get(void)
{
	return get(&rw_map);
}

/*
 * orangefs_bufmap_put()
 *
 * returns a mapped buffer descriptor to the collection
 *
 * no return value
 */
void orangefs_bufmap_put(int buffer_index)
{
	put(&rw_map, buffer_index);
}

/*
 * orangefs_readdir_index_get()
 *
 * gets a free descriptor, will sleep until one becomes
 * available if necessary.
 * Although the readdir buffers are not mapped into kernel space
 * we could do that at a later point of time. Regardless, these
 * indices are used by the client-core.
 *
 * returns slot on success, -errno on failure
 */
int orangefs_readdir_index_get(void)
{
	return get(&readdir_map);
}

void orangefs_readdir_index_put(int buffer_index)
{
	put(&readdir_map, buffer_index);
}

/*
 * we've been handed an iovec, we need to copy it to
 * the shared memory descriptor at "buffer_index".
 */
int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
				    int buffer_index,
				    size_t size)
{
	struct orangefs_bufmap_desc *to;
	int i;

	gossip_debug(GOSSIP_BUFMAP_DEBUG,
		     "%s: buffer_index:%d: size:%zu:\n",
		     __func__, buffer_index, size);

	to = &__orangefs_bufmap->desc_array[buffer_index];
	for (i = 0; size; i++) {
		struct page *page = to->page_array[i];
		size_t n = size;
		if (n > PAGE_SIZE)
			n = PAGE_SIZE;
		n = copy_page_from_iter(page, 0, n, iter);
		if (!n)
			return -EFAULT;
		size -= n;
	}
	return 0;
}

/*
 * we've been handed an iovec, we need to fill it from
 * the shared memory descriptor at "buffer_index".
 */
int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
				  int buffer_index,
				  size_t size)
{
	struct orangefs_bufmap_desc *from;
	int i;

	from = &__orangefs_bufmap->desc_array[buffer_index];
	gossip_debug(GOSSIP_BUFMAP_DEBUG,
		     "%s: buffer_index:%d: size:%zu:\n",
		     __func__, buffer_index, size);

	for (i = 0; size; i++) {
		struct page *page = from->page_array[i];
		size_t n = size;
		if (n > PAGE_SIZE)
			n = PAGE_SIZE;
		n = copy_page_to_iter(page, 0, n, iter);
		if (!n)
			return -EFAULT;
		size -= n;
	}
	return 0;
}