diff options
Diffstat (limited to 'Documentation')
-rw-r--r-- | Documentation/ABI/stable/sysfs-fs-orangefs | 87 | ||||
-rw-r--r-- | Documentation/filesystems/orangefs.txt | 406 |
2 files changed, 493 insertions, 0 deletions
diff --git a/Documentation/ABI/stable/sysfs-fs-orangefs b/Documentation/ABI/stable/sysfs-fs-orangefs new file mode 100644 index 000000000000..affdb114bd33 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-fs-orangefs | |||
@@ -0,0 +1,87 @@ | |||
1 | What: /sys/fs/orangefs/perf_counters/* | ||
2 | Date: Jun 2015 | ||
3 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
4 | Description: | ||
5 | Counters and settings for various caches. | ||
6 | Read only. | ||
7 | |||
8 | |||
9 | What: /sys/fs/orangefs/perf_counter_reset | ||
10 | Date: Jun 2015 | ||
11 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
12 | Description: | ||
13 | echo a 0 or a 1 into perf_counter_reset to | ||
14 | reset all the counters in | ||
15 | /sys/fs/orangefs/perf_counters | ||
16 | except ones with PINT_PERF_PRESERVE set. | ||
17 | |||
18 | |||
19 | What: /sys/fs/orangefs/perf_time_interval_secs | ||
20 | Date: Jun 2015 | ||
21 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
22 | Description: | ||
23 | Length of perf counter intervals in | ||
24 | seconds. | ||
25 | |||
26 | |||
27 | What: /sys/fs/orangefs/perf_history_size | ||
28 | Date: Jun 2015 | ||
29 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
30 | Description: | ||
31 | The perf_counters cache statistics have N, or | ||
32 | perf_history_size, samples. The default is | ||
33 | one. | ||
34 | |||
35 | Every perf_time_interval_secs the (first) | ||
36 | samples are reset. | ||
37 | |||
38 | If N is greater than one, the "current" set | ||
39 | of samples is reset, and the samples from the | ||
40 | other N-1 intervals remain available. | ||
41 | |||
42 | |||
43 | What: /sys/fs/orangefs/op_timeout_secs | ||
44 | Date: Jun 2015 | ||
45 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
46 | Description: | ||
47 | Service operation timeout in seconds. | ||
48 | |||
49 | |||
50 | What: /sys/fs/orangefs/slot_timeout_secs | ||
51 | Date: Jun 2015 | ||
52 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
53 | Description: | ||
54 | "Slot" timeout in seconds. A "slot" | ||
55 | is an indexed buffer in the shared | ||
56 | memory segment used for communication | ||
57 | between the kernel module and userspace. | ||
58 | Slots are requested and waited for; | ||
59 | the wait times out after slot_timeout_secs. | ||
60 | |||
61 | |||
62 | What: /sys/fs/orangefs/acache/* | ||
63 | Date: Jun 2015 | ||
64 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
65 | Description: | ||
66 | Attribute cache configurable settings. | ||
67 | |||
68 | |||
69 | What: /sys/fs/orangefs/ncache/* | ||
70 | Date: Jun 2015 | ||
71 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
72 | Description: | ||
73 | Name cache configurable settings. | ||
74 | |||
75 | |||
76 | What: /sys/fs/orangefs/capcache/* | ||
77 | Date: Jun 2015 | ||
78 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
79 | Description: | ||
80 | Capability cache configurable settings. | ||
81 | |||
82 | |||
83 | What: /sys/fs/orangefs/ccache/* | ||
84 | Date: Jun 2015 | ||
85 | Contact: Mike Marshall <hubcap@omnibond.com> | ||
86 | Description: | ||
87 | Credential cache configurable settings. | ||
diff --git a/Documentation/filesystems/orangefs.txt b/Documentation/filesystems/orangefs.txt new file mode 100644 index 000000000000..e1a0056a365f --- /dev/null +++ b/Documentation/filesystems/orangefs.txt | |||
@@ -0,0 +1,406 @@ | |||
1 | ORANGEFS | ||
2 | ======== | ||
3 | |||
4 | OrangeFS is an LGPL userspace scale-out parallel storage system. It is ideal | ||
5 | for large storage problems faced by HPC, BigData, Streaming Video, | ||
6 | Genomics, and Bioinformatics. | ||
7 | |||
8 | Orangefs, originally called PVFS, was first developed in 1993 by | ||
9 | Walt Ligon and Eric Blumer as a parallel file system for Parallel | ||
10 | Virtual Machine (PVM) as part of a NASA grant to study the I/O patterns | ||
11 | of parallel programs. | ||
12 | |||
13 | Orangefs features include: | ||
14 | |||
15 | * Distributes file data among multiple file servers | ||
16 | * Supports simultaneous access by multiple clients | ||
17 | * Stores file data and metadata on servers using local file system | ||
18 | and access methods | ||
19 | * Userspace implementation is easy to install and maintain | ||
20 | * Direct MPI support | ||
21 | * Stateless | ||
22 | |||
23 | |||
24 | MAILING LIST | ||
25 | ============ | ||
26 | |||
27 | http://beowulf-underground.org/mailman/listinfo/pvfs2-users | ||
28 | |||
29 | |||
30 | DOCUMENTATION | ||
31 | ============= | ||
32 | |||
33 | http://www.orangefs.org/documentation/ | ||
34 | |||
35 | |||
36 | USERSPACE FILESYSTEM SOURCE | ||
37 | =========================== | ||
38 | |||
39 | http://www.orangefs.org/download | ||
40 | |||
41 | Orangefs versions prior to 2.9.3 would not be compatible with the | ||
42 | upstream version of the kernel client. | ||
43 | |||
44 | |||
45 | BUILDING THE USERSPACE FILESYSTEM ON A SINGLE SERVER | ||
46 | ==================================================== | ||
47 | |||
48 | When Orangefs is upstream, "--with-kernel" shouldn't be needed, but | ||
49 | until then the path to where the kernel with the Orangefs kernel client | ||
50 | patch was built is needed to ensure that pvfs2-client-core (the bridge | ||
51 | between kernel space and user space) will build properly. You can omit | ||
52 | --prefix if you don't care that things are sprinkled around in | ||
53 | /usr/local. | ||
54 | |||
55 | ./configure --prefix=/opt/ofs --with-kernel=/path/to/orangefs/kernel | ||
56 | |||
57 | make | ||
58 | |||
59 | make install | ||
60 | |||
61 | Create an orangefs config file: | ||
62 | /opt/ofs/bin/pvfs2-genconfig /etc/pvfs2.conf | ||
63 | |||
64 | for "Enter hostnames", use the hostname, don't let it default to | ||
65 | localhost. | ||
66 | |||
67 | create a pvfs2tab file in /etc: | ||
68 | cat /etc/pvfs2tab | ||
69 | tcp://myhostname:3334/orangefs /mymountpoint pvfs2 defaults,noauto 0 0 | ||
70 | |||
71 | create the mount point you specified in the tab file if needed: | ||
72 | mkdir /mymountpoint | ||
73 | |||
74 | bootstrap the server: | ||
75 | /opt/ofs/sbin/pvfs2-server /etc/pvfs2.conf -f | ||
76 | |||
77 | start the server: | ||
78 | /opt/ofs/sbin/pvfs2-server /etc/pvfs2.conf | ||
79 | |||
80 | Now the server is running. At this point you might like to | ||
81 | prove things are working with: | ||
82 | |||
83 | /opt/ofs/bin/pvfs2-ls /mymountpoint | ||
84 | |||
85 | You might not want to enforce selinux, it doesn't seem to matter by | ||
86 | linux 3.11... | ||
87 | |||
88 | If stuff seems to be working, turn on the client core: | ||
89 | /opt/ofs/sbin/pvfs2-client -p /opt/ofs/sbin/pvfs2-client-core | ||
90 | |||
91 | Mount your filesystem. | ||
92 | mount -t pvfs2 tcp://myhostname:3334/orangefs /mymountpoint | ||
93 | |||
94 | |||
95 | OPTIONS | ||
96 | ======= | ||
97 | |||
98 | The following mount options are accepted: | ||
99 | |||
100 | acl | ||
101 | Allow the use of Access Control Lists on files and directories. | ||
102 | |||
103 | intr | ||
104 | Some operations between the kernel client and the user space | ||
105 | filesystem can be interruptible, such as changes in debug levels | ||
106 | and the setting of tunable parameters. | ||
107 | |||
108 | local_lock | ||
109 | Enable posix locking from the perspective of "this" kernel. The | ||
110 | default file_operations lock action is to return ENOSYS. Posix | ||
111 | locking kicks in if the filesystem is mounted with -o local_lock. | ||
112 | Distributed locking is being worked on for the future. | ||
113 | |||
114 | |||
115 | DEBUGGING | ||
116 | ========= | ||
117 | |||
118 | If you want the debug (GOSSIP) statements in a particular | ||
119 | source file (inode.c for example) go to syslog: | ||
120 | |||
121 | echo inode > /sys/kernel/debug/orangefs/kernel-debug | ||
122 | |||
123 | No debugging (the default): | ||
124 | |||
125 | echo none > /sys/kernel/debug/orangefs/kernel-debug | ||
126 | |||
127 | Debugging from several source files: | ||
128 | |||
129 | echo inode,dir > /sys/kernel/debug/orangefs/kernel-debug | ||
130 | |||
131 | All debugging: | ||
132 | |||
133 | echo all > /sys/kernel/debug/orangefs/kernel-debug | ||
134 | |||
135 | Get a list of all debugging keywords: | ||
136 | |||
137 | cat /sys/kernel/debug/orangefs/debug-help | ||
138 | |||
139 | |||
140 | PROTOCOL BETWEEN KERNEL MODULE AND USERSPACE | ||
141 | ============================================ | ||
142 | |||
143 | Orangefs is a user space filesystem and an associated kernel module. | ||
144 | We'll just refer to the user space part of Orangefs as "userspace" | ||
145 | from here on out. Orangefs descends from PVFS, and userspace code | ||
146 | still uses PVFS for function and variable names. Userspace typedefs | ||
147 | many of the important structures. Function and variable names in | ||
148 | the kernel module have been transitioned to "orangefs", and The Linux | ||
149 | Coding Style avoids typedefs, so kernel module structures that | ||
150 | correspond to userspace structures are not typedefed. | ||
151 | |||
152 | The kernel module implements a pseudo device that userspace | ||
153 | can read from and write to. Userspace can also manipulate the | ||
154 | kernel module through the pseudo device with ioctl. | ||
155 | |||
156 | THE BUFMAP: | ||
157 | |||
158 | At startup userspace allocates two page-size-aligned (posix_memalign) | ||
159 | mlocked memory buffers, one is used for IO and one is used for readdir | ||
160 | operations. The IO buffer is 41943040 bytes and the readdir buffer is | ||
161 | 4194304 bytes. Each buffer contains logical chunks, or partitions, and | ||
162 | a pointer to each buffer is added to its own PVFS_dev_map_desc structure | ||
163 | which also describes its total size, as well as the size and number of | ||
164 | the partitions. | ||
165 | |||
166 | A pointer to the IO buffer's PVFS_dev_map_desc structure is sent to a | ||
167 | mapping routine in the kernel module with an ioctl. The structure is | ||
168 | copied from user space to kernel space with copy_from_user and is used | ||
169 | to initialize the kernel module's "bufmap" (struct orangefs_bufmap), which | ||
170 | then contains: | ||
171 | |||
172 | * refcnt - a reference counter | ||
173 | * desc_size - PVFS2_BUFMAP_DEFAULT_DESC_SIZE (4194304) - the IO buffer's | ||
174 | partition size, which represents the filesystem's block size and | ||
175 | is used for s_blocksize in super blocks. | ||
176 | * desc_count - PVFS2_BUFMAP_DEFAULT_DESC_COUNT (10) - the number of | ||
177 | partitions in the IO buffer. | ||
178 | * desc_shift - log2(desc_size), used for s_blocksize_bits in super blocks. | ||
179 | * total_size - the total size of the IO buffer. | ||
180 | * page_count - the number of 4096 byte pages in the IO buffer. | ||
181 | * page_array - a pointer to page_count * (sizeof(struct page*)) bytes | ||
182 | of kcalloced memory. This memory is used as an array of pointers | ||
183 | to each of the pages in the IO buffer through a call to get_user_pages. | ||
184 | * desc_array - a pointer to desc_count * (sizeof(struct orangefs_bufmap_desc)) | ||
185 | bytes of kcalloced memory. This memory is further initialized: | ||
186 | |||
187 | user_desc is the kernel's copy of the IO buffer's ORANGEFS_dev_map_desc | ||
188 | structure. user_desc->ptr points to the IO buffer. | ||
189 | |||
190 | pages_per_desc = bufmap->desc_size / PAGE_SIZE | ||
191 | offset = 0 | ||
192 | |||
193 | bufmap->desc_array[0].page_array = &bufmap->page_array[offset] | ||
194 | bufmap->desc_array[0].array_count = pages_per_desc = 1024 | ||
195 | bufmap->desc_array[0].uaddr = (user_desc->ptr) + (0 * 1024 * 4096) | ||
196 | offset += 1024 | ||
197 | . | ||
198 | . | ||
199 | . | ||
200 | bufmap->desc_array[9].page_array = &bufmap->page_array[offset] | ||
201 | bufmap->desc_array[9].array_count = pages_per_desc = 1024 | ||
202 | bufmap->desc_array[9].uaddr = (user_desc->ptr) + | ||
203 | (9 * 1024 * 4096) | ||
204 | offset += 1024 | ||
205 | |||
206 | * buffer_index_array - a desc_count sized array of ints, used to | ||
207 | indicate which of the IO buffer's partitions are available to use. | ||
208 | * buffer_index_lock - a spinlock to protect buffer_index_array during update. | ||
209 | * readdir_index_array - a five (ORANGEFS_READDIR_DEFAULT_DESC_COUNT) element | ||
210 | int array used to indicate which of the readdir buffer's partitions are | ||
211 | available to use. | ||
212 | * readdir_index_lock - a spinlock to protect readdir_index_array during | ||
213 | update. | ||
214 | |||
215 | OPERATIONS: | ||
216 | |||
217 | The kernel module builds an "op" (struct orangefs_kernel_op_s) when it | ||
218 | needs to communicate with userspace. Part of the op contains the "upcall" | ||
219 | which expresses the request to userspace. Part of the op eventually | ||
220 | contains the "downcall" which expresses the results of the request. | ||
221 | |||
222 | The slab allocator is used to keep a cache of op structures handy. | ||
223 | |||
224 | At init time the kernel module defines and initializes a request list | ||
225 | and an in_progress hash table to keep track of all the ops that are | ||
226 | in flight at any given time. | ||
227 | |||
228 | Ops are stateful: | ||
229 | |||
230 | * unknown - op was just initialized | ||
231 | * waiting - op is on request_list (upward bound) | ||
232 | * inprogr - op is in progress (waiting for downcall) | ||
233 | * serviced - op has matching downcall; ok | ||
234 | * purged - op has to start a timer since client-core | ||
235 | exited uncleanly before servicing op | ||
236 | * given up - submitter has given up waiting for it | ||
237 | |||
238 | When some arbitrary userspace program needs to perform a | ||
239 | filesystem operation on Orangefs (readdir, I/O, create, whatever) | ||
240 | an op structure is initialized and tagged with a distinguishing ID | ||
241 | number. The upcall part of the op is filled out, and the op is | ||
242 | passed to the "service_operation" function. | ||
243 | |||
244 | Service_operation changes the op's state to "waiting", puts | ||
245 | it on the request list, and signals the Orangefs file_operations.poll | ||
246 | function through a wait queue. Userspace is polling the pseudo-device | ||
247 | and thus becomes aware of the upcall request that needs to be read. | ||
248 | |||
249 | When the Orangefs file_operations.read function is triggered, the | ||
250 | request list is searched for an op that seems ready-to-process. | ||
251 | The op is removed from the request list. The tag from the op and | ||
252 | the filled-out upcall struct are copy_to_user'ed back to userspace. | ||
253 | |||
254 | If any of these (and some additional protocol) copy_to_users fail, | ||
255 | the op's state is set to "waiting" and the op is added back to | ||
256 | the request list. Otherwise, the op's state is changed to "in progress", | ||
257 | and the op is hashed on its tag and put onto the end of a list in the | ||
258 | in_progress hash table at the index the tag hashed to. | ||
259 | |||
260 | When userspace has assembled the response to the upcall, it | ||
261 | writes the response, which includes the distinguishing tag, back to | ||
262 | the pseudo device in a series of io_vecs. This triggers the Orangefs | ||
263 | file_operations.write_iter function to find the op with the associated | ||
264 | tag and remove it from the in_progress hash table. As long as the op's | ||
265 | state is not "canceled" or "given up", its state is set to "serviced". | ||
266 | The file_operations.write_iter function returns to the waiting vfs, | ||
267 | and back to service_operation through wait_for_matching_downcall. | ||
268 | |||
269 | Service operation returns to its caller with the op's downcall | ||
270 | part (the response to the upcall) filled out. | ||
271 | |||
272 | The "client-core" is the bridge between the kernel module and | ||
273 | userspace. The client-core is a daemon. The client-core has an | ||
274 | associated watchdog daemon. If the client-core is ever signaled | ||
275 | to die, the watchdog daemon restarts the client-core. Even though | ||
276 | the client-core is restarted "right away", there is a period of | ||
277 | time during such an event that the client-core is dead. A dead client-core | ||
278 | can't be triggered by the Orangefs file_operations.poll function. | ||
279 | Ops that pass through service_operation during a "dead spell" can timeout | ||
280 | on the wait queue and one attempt is made to recycle them. Obviously, | ||
281 | if the client-core stays dead too long, the arbitrary userspace processes | ||
282 | trying to use Orangefs will be negatively affected. Waiting ops | ||
283 | that can't be serviced will be removed from the request list and | ||
284 | have their states set to "given up". In-progress ops that can't | ||
285 | be serviced will be removed from the in_progress hash table and | ||
286 | have their states set to "given up". | ||
287 | |||
288 | Readdir and I/O ops are atypical with respect to their payloads. | ||
289 | |||
290 | - readdir ops use the smaller of the two pre-allocated pre-partitioned | ||
291 | memory buffers. The readdir buffer is only available to userspace. | ||
292 | The kernel module obtains an index to a free partition before launching | ||
293 | a readdir op. Userspace deposits the results into the indexed partition | ||
294 | and then writes them back to the pvfs device. | ||
295 | |||
296 | - io (read and write) ops use the larger of the two pre-allocated | ||
297 | pre-partitioned memory buffers. The IO buffer is accessible from | ||
298 | both userspace and the kernel module. The kernel module obtains an | ||
299 | index to a free partition before launching an io op. The kernel module | ||
300 | deposits write data into the indexed partition, to be consumed | ||
301 | directly by userspace. Userspace deposits the results of read | ||
302 | requests into the indexed partition, to be consumed directly | ||
303 | by the kernel module. | ||
304 | |||
305 | Responses to kernel requests are all packaged in pvfs2_downcall_t | ||
306 | structs. Besides a few other members, pvfs2_downcall_t contains a | ||
307 | union of structs, each of which is associated with a particular | ||
308 | response type. | ||
309 | |||
310 | The several members outside of the union are: | ||
311 | - int32_t type - type of operation. | ||
312 | - int32_t status - return code for the operation. | ||
313 | - int64_t trailer_size - 0 unless readdir operation. | ||
314 | - char *trailer_buf - initialized to NULL, used during readdir operations. | ||
315 | |||
316 | The appropriate member inside the union is filled out for any | ||
317 | particular response. | ||
318 | |||
319 | PVFS2_VFS_OP_FILE_IO | ||
320 | fill a pvfs2_io_response_t | ||
321 | |||
322 | PVFS2_VFS_OP_LOOKUP | ||
323 | fill a PVFS_object_kref | ||
324 | |||
325 | PVFS2_VFS_OP_CREATE | ||
326 | fill a PVFS_object_kref | ||
327 | |||
328 | PVFS2_VFS_OP_SYMLINK | ||
329 | fill a PVFS_object_kref | ||
330 | |||
331 | PVFS2_VFS_OP_GETATTR | ||
332 | fill in a PVFS_sys_attr_s (tons of stuff the kernel doesn't need) | ||
333 | fill in a string with the link target when the object is a symlink. | ||
334 | |||
335 | PVFS2_VFS_OP_MKDIR | ||
336 | fill a PVFS_object_kref | ||
337 | |||
338 | PVFS2_VFS_OP_STATFS | ||
339 | fill a pvfs2_statfs_response_t with useless info <g>. It is hard for | ||
340 | us to know, in a timely fashion, these statistics about our | ||
341 | distributed network filesystem. | ||
342 | |||
343 | PVFS2_VFS_OP_FS_MOUNT | ||
344 | fill a pvfs2_fs_mount_response_t which is just like a PVFS_object_kref | ||
345 | except its members are in a different order and "__pad1" is replaced | ||
346 | with "id". | ||
347 | |||
348 | PVFS2_VFS_OP_GETXATTR | ||
349 | fill a pvfs2_getxattr_response_t | ||
350 | |||
351 | PVFS2_VFS_OP_LISTXATTR | ||
352 | fill a pvfs2_listxattr_response_t | ||
353 | |||
354 | PVFS2_VFS_OP_PARAM | ||
355 | fill a pvfs2_param_response_t | ||
356 | |||
357 | PVFS2_VFS_OP_PERF_COUNT | ||
358 | fill a pvfs2_perf_count_response_t | ||
359 | |||
360 | PVFS2_VFS_OP_FSKEY | ||
361 | fill a pvfs2_fs_key_response_t | ||
362 | |||
363 | PVFS2_VFS_OP_READDIR | ||
364 | jam everything needed to represent a pvfs2_readdir_response_t into | ||
365 | the readdir buffer descriptor specified in the upcall. | ||
366 | |||
367 | Userspace uses writev() on /dev/pvfs2-req to pass responses to the requests | ||
368 | made by the kernel side. | ||
369 | |||
370 | A buffer_list containing: | ||
371 | - a pointer to the prepared response to the request from the | ||
372 | kernel (struct pvfs2_downcall_t). | ||
373 | - and also, in the case of a readdir request, a pointer to a | ||
374 | buffer containing descriptors for the objects in the target | ||
375 | directory. | ||
376 | ... is sent to the function (PINT_dev_write_list) which performs | ||
377 | the writev. | ||
378 | |||
379 | PINT_dev_write_list has a local iovec array: struct iovec io_array[10]; | ||
380 | |||
381 | The first four elements of io_array are initialized like this for all | ||
382 | responses: | ||
383 | |||
384 | io_array[0].iov_base = address of local variable "proto_ver" (int32_t) | ||
385 | io_array[0].iov_len = sizeof(int32_t) | ||
386 | |||
387 | io_array[1].iov_base = address of global variable "pdev_magic" (int32_t) | ||
388 | io_array[1].iov_len = sizeof(int32_t) | ||
389 | |||
390 | io_array[2].iov_base = address of parameter "tag" (PVFS_id_gen_t) | ||
391 | io_array[2].iov_len = sizeof(int64_t) | ||
392 | |||
393 | io_array[3].iov_base = address of out_downcall member (pvfs2_downcall_t) | ||
394 | of global variable vfs_request (vfs_request_t) | ||
395 | io_array[3].iov_len = sizeof(pvfs2_downcall_t) | ||
396 | |||
397 | Readdir responses initialize the fifth element of io_array like this: | ||
398 | |||
399 | io_array[4].iov_base = contents of member trailer_buf (char *) | ||
400 | from out_downcall member of global variable | ||
401 | vfs_request | ||
402 | io_array[4].iov_len = contents of member trailer_size (PVFS_size) | ||
403 | from out_downcall member of global variable | ||
404 | vfs_request | ||
405 | |||
406 | |||