diff options
| author | Mike Marshall <hubcap@omnibond.com> | 2016-01-13 14:28:13 -0500 |
|---|---|---|
| committer | Mike Marshall <hubcap@omnibond.com> | 2016-01-13 14:28:13 -0500 |
| commit | fcac9d571567e8bf952616f4a271eea5b4b407ea (patch) | |
| tree | eb793a1870f7aedd76486de681b4bf1ed8bb1285 /Documentation | |
| parent | be57366e14d8341f5d2b589d5b59151895afe210 (diff) | |
Orangefs: add protocol information to Documentation/filesystems/orangefs.txt
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
Diffstat (limited to 'Documentation')
| -rw-r--r-- | Documentation/filesystems/orangefs.txt | 218 |
1 files changed, 217 insertions, 1 deletions
diff --git a/Documentation/filesystems/orangefs.txt b/Documentation/filesystems/orangefs.txt index ec9c8416427e..925a53e52097 100644 --- a/Documentation/filesystems/orangefs.txt +++ b/Documentation/filesystems/orangefs.txt | |||
| @@ -115,7 +115,7 @@ The following mount options are accepted: | |||
| 115 | DEBUGGING | 115 | DEBUGGING |
| 116 | ========= | 116 | ========= |
| 117 | 117 | ||
| 118 | If you want the debug (GOSSIP) statments in a particular | 118 | If you want the debug (GOSSIP) statements in a particular |
| 119 | source file (inode.c for example) go to syslog: | 119 | source file (inode.c for example) go to syslog: |
| 120 | 120 | ||
| 121 | echo inode > /sys/kernel/debug/orangefs/kernel-debug | 121 | echo inode > /sys/kernel/debug/orangefs/kernel-debug |
| @@ -135,3 +135,219 @@ All debugging: | |||
| 135 | Get a list of all debugging keywords: | 135 | Get a list of all debugging keywords: |
| 136 | 136 | ||
| 137 | cat /sys/kernel/debug/orangefs/debug-help | 137 | cat /sys/kernel/debug/orangefs/debug-help |
| 138 | |||
| 139 | |||
| 140 | PROTOCOL BETWEEN KERNEL MODULE AND USERSPACE | ||
| 141 | ============================================ | ||
| 142 | |||
| 143 | Orangefs is a user space filesystem and an associated kernel module. | ||
| 144 | We'll just refer to the user space part of Orangefs as "userspace" | ||
| 145 | from here on out. Orangefs descends from PVFS, and userspace code | ||
| 146 | still uses PVFS for function and variable names. Userspace typedefs | ||
| 147 | many of the important structures. Function and variable names in | ||
| 148 | the kernel module have been transitioned to "orangefs", and The Linux | ||
| 149 | Coding Style avoids typedefs, so kernel module structures that | ||
| 150 | correspond to userspace structures are not typedefed. | ||
| 151 | |||
| 152 | The kernel module implements a pseudo device that userspace | ||
| 153 | can read from and write to. Userspace can also manipulate the | ||
| 154 | kernel module through the pseudo device with ioctl. | ||
| 155 | |||
| 156 | THE BUFMAP: | ||
| 157 | |||
| 158 | At startup userspace allocates two page-size-aligned (posix_memalign) | ||
| 159 | mlocked memory buffers, one is used for IO and one is used for readdir | ||
| 160 | operations. The IO buffer is 41943040 bytes and the readdir buffer is | ||
| 161 | 4194304 bytes. Each buffer contains logical chunks, or partitions, and | ||
| 162 | a pointer to each buffer is added to its own PVFS_dev_map_desc structure | ||
| 163 | which also describes its total size, as well as the size and number of | ||
| 164 | the partitions. | ||
| 165 | |||
| 166 | A pointer to the IO buffer's PVFS_dev_map_desc structure is sent to a | ||
| 167 | mapping routine in the kernel module with an ioctl. The structure is | ||
| 168 | copied from user space to kernel space with copy_from_user and is used | ||
| 169 | to initialize the kernel module's "bufmap" (struct orangefs_bufmap), which | ||
| 170 | then contains: | ||
| 171 | |||
| 172 | * refcnt - a reference counter | ||
| 173 | * desc_size - PVFS2_BUFMAP_DEFAULT_DESC_SIZE (4194304) - the IO buffer's | ||
| 174 | partition size, which represents the filesystem's block size and | ||
| 175 | is used for s_blocksize in super blocks. | ||
| 176 | * desc_count - PVFS2_BUFMAP_DEFAULT_DESC_COUNT (10) - the number of | ||
| 177 | partitions in the IO buffer. | ||
| 178 | * desc_shift - log2(desc_size), used for s_blocksize_bits in super blocks. | ||
| 179 | * total_size - the total size of the IO buffer. | ||
| 180 | * page_count - the number of 4096 byte pages in the IO buffer. | ||
| 181 | * page_array - a pointer to page_count * (sizeof(struct page*)) bytes | ||
| 182 | of kcalloced memory. This memory is used as an array of pointers | ||
| 183 | to each of the pages in the IO buffer through a call to get_user_pages. | ||
| 184 | * desc_array - a pointer to desc_count * (sizeof(struct orangefs_bufmap_desc)) | ||
| 185 | bytes of kcalloced memory. This memory is further initialized: | |
| 186 | |||
| 187 | user_desc is the kernel's copy of the IO buffer's ORANGEFS_dev_map_desc | ||
| 188 | structure. user_desc->ptr points to the IO buffer. | ||
| 189 | |||
| 190 | pages_per_desc = bufmap->desc_size / PAGE_SIZE | ||
| 191 | offset = 0 | ||
| 192 | |||
| 193 | bufmap->desc_array[0].page_array = &bufmap->page_array[offset] | ||
| 194 | bufmap->desc_array[0].array_count = pages_per_desc = 1024 | ||
| 195 | bufmap->desc_array[0].uaddr = (user_desc->ptr) + (0 * 1024 * 4096) | ||
| 196 | offset += 1024 | ||
| 197 | . | ||
| 198 | . | ||
| 199 | . | ||
| 200 | bufmap->desc_array[9].page_array = &bufmap->page_array[offset] | ||
| 201 | bufmap->desc_array[9].array_count = pages_per_desc = 1024 | ||
| 202 | bufmap->desc_array[9].uaddr = (user_desc->ptr) + | ||
| 203 | (9 * 1024 * 4096) | ||
| 204 | offset += 1024 | ||
| 205 | |||
| 206 | * buffer_index_array - a desc_count sized array of ints, used to | ||
| 207 | indicate which of the IO buffer's partitions are available to use. | ||
| 208 | * buffer_index_lock - a spinlock to protect buffer_index_array during update. | ||
| 209 | * readdir_index_array - a five (ORANGEFS_READDIR_DEFAULT_DESC_COUNT) element | ||
| 210 | int array used to indicate which of the readdir buffer's partitions are | ||
| 211 | available to use. | ||
| 212 | * readdir_index_lock - a spinlock to protect readdir_index_array during | ||
| 213 | update. | ||
| 214 | |||
| 215 | OPERATIONS: | ||
| 216 | |||
| 217 | The kernel module builds an "op" (struct orangefs_kernel_op_s) when it | ||
| 218 | needs to communicate with userspace. Part of the op contains the "upcall" | ||
| 219 | which expresses the request to userspace. Part of the op eventually | ||
| 220 | contains the "downcall" which expresses the results of the request. | ||
| 221 | |||
| 222 | The slab allocator is used to keep a cache of op structures handy. | ||
| 223 | |||
| 224 | The life cycle of a typical op goes like this: | ||
| 225 | |||
| 226 | - obtain and initialize an op structure from the op_cache. | ||
| 227 | |||
| 228 | - queue the op to the pvfs device so that its upcall data can be | ||
| 229 | read by userspace. | ||
| 230 | |||
| 231 | - wait for userspace to write downcall data back to the pvfs device. | ||
| 232 | |||
| 233 | - consume the downcall and return the op struct to the op_cache. | ||
| 234 | |||
| 235 | Some ops are atypical with respect to their payloads: readdir and io ops. | ||
| 236 | |||
| 237 | - readdir ops use the smaller of the two pre-allocated pre-partitioned | ||
| 238 | memory buffers. The readdir buffer is only available to userspace. | ||
| 239 | The kernel module obtains an index to a free partition before launching | ||
| 240 | a readdir op. Userspace deposits the results into the indexed partition | ||
| 241 | and then writes them back to the pvfs device. | |
| 242 | |||
| 243 | - io (read and write) ops use the larger of the two pre-allocated | ||
| 244 | pre-partitioned memory buffers. The IO buffer is accessible from | ||
| 245 | both userspace and the kernel module. The kernel module obtains an | ||
| 246 | index to a free partition before launching an io op. The kernel module | ||
| 247 | deposits write data into the indexed partition, to be consumed | ||
| 248 | directly by userspace. Userspace deposits the results of read | ||
| 249 | requests into the indexed partition, to be consumed directly | ||
| 250 | by the kernel module. | ||
| 251 | |||
| 252 | Responses to kernel requests are all packaged in pvfs2_downcall_t | ||
| 253 | structs. Besides a few other members, pvfs2_downcall_t contains a | ||
| 254 | union of structs, each of which is associated with a particular | ||
| 255 | response type. | ||
| 256 | |||
| 257 | The several members outside of the union are: | ||
| 258 | - int32_t type - type of operation. | ||
| 259 | - int32_t status - return code for the operation. | ||
| 260 | - int64_t trailer_size - 0 unless readdir operation. | ||
| 261 | - char *trailer_buf - initialized to NULL, used during readdir operations. | ||
| 262 | |||
| 263 | The appropriate member inside the union is filled out for any | ||
| 264 | particular response. | ||
| 265 | |||
| 266 | PVFS2_VFS_OP_FILE_IO | ||
| 267 | fill a pvfs2_io_response_t | ||
| 268 | |||
| 269 | PVFS2_VFS_OP_LOOKUP | ||
| 270 | fill a PVFS_object_kref | ||
| 271 | |||
| 272 | PVFS2_VFS_OP_CREATE | ||
| 273 | fill a PVFS_object_kref | ||
| 274 | |||
| 275 | PVFS2_VFS_OP_SYMLINK | ||
| 276 | fill a PVFS_object_kref | ||
| 277 | |||
| 278 | PVFS2_VFS_OP_GETATTR | ||
| 279 | fill in a PVFS_sys_attr_s (tons of stuff the kernel doesn't need) | ||
| 280 | fill in a string with the link target when the object is a symlink. | ||
| 281 | |||
| 282 | PVFS2_VFS_OP_MKDIR | ||
| 283 | fill a PVFS_object_kref | ||
| 284 | |||
| 285 | PVFS2_VFS_OP_STATFS | ||
| 286 | fill a pvfs2_statfs_response_t with useless info <g>. It is hard for | ||
| 287 | us to know, in a timely fashion, these statistics about our | ||
| 288 | distributed network filesystem. | ||
| 289 | |||
| 290 | PVFS2_VFS_OP_FS_MOUNT | ||
| 291 | fill a pvfs2_fs_mount_response_t which is just like a PVFS_object_kref | ||
| 292 | except its members are in a different order and "__pad1" is replaced | ||
| 293 | with "id". | ||
| 294 | |||
| 295 | PVFS2_VFS_OP_GETXATTR | ||
| 296 | fill a pvfs2_getxattr_response_t | ||
| 297 | |||
| 298 | PVFS2_VFS_OP_LISTXATTR | ||
| 299 | fill a pvfs2_listxattr_response_t | ||
| 300 | |||
| 301 | PVFS2_VFS_OP_PARAM | ||
| 302 | fill a pvfs2_param_response_t | ||
| 303 | |||
| 304 | PVFS2_VFS_OP_PERF_COUNT | ||
| 305 | fill a pvfs2_perf_count_response_t | ||
| 306 | |||
| 307 | PVFS2_VFS_OP_FSKEY | ||
| 308 | fill a pvfs2_fs_key_response_t | |
| 309 | |||
| 310 | PVFS2_VFS_OP_READDIR | ||
| 311 | jam everything needed to represent a pvfs2_readdir_response_t into | |
| 312 | the readdir buffer descriptor specified in the upcall. | ||
| 313 | |||
| 314 | writev() on /dev/pvfs2-req is used to pass responses to the requests | ||
| 315 | made by the kernel side. | ||
| 316 | |||
| 317 | A buffer_list containing: | ||
| 318 | - a pointer to the prepared response to the request from the | ||
| 319 | kernel (struct pvfs2_downcall_t). | ||
| 320 | - and also, in the case of a readdir request, a pointer to a | ||
| 321 | buffer containing descriptors for the objects in the target | ||
| 322 | directory. | ||
| 323 | ... is sent to the function (PINT_dev_write_list) which performs | ||
| 324 | the writev. | ||
| 325 | |||
| 326 | PINT_dev_write_list has a local iovec array: struct iovec io_array[10]; | ||
| 327 | |||
| 328 | The first four elements of io_array are initialized like this for all | ||
| 329 | responses: | ||
| 330 | |||
| 331 | io_array[0].iov_base = address of local variable "proto_ver" (int32_t) | ||
| 332 | io_array[0].iov_len = sizeof(int32_t) | ||
| 333 | |||
| 334 | io_array[1].iov_base = address of global variable "pdev_magic" (int32_t) | ||
| 335 | io_array[1].iov_len = sizeof(int32_t) | ||
| 336 | |||
| 337 | io_array[2].iov_base = address of parameter "tag" (PVFS_id_gen_t) | ||
| 338 | io_array[2].iov_len = sizeof(int64_t) | ||
| 339 | |||
| 340 | io_array[3].iov_base = address of out_downcall member (pvfs2_downcall_t) | ||
| 341 | of global variable vfs_request (vfs_request_t) | ||
| 342 | io_array[3].iov_len = sizeof(pvfs2_downcall_t) | ||
| 343 | |||
| 344 | Readdir responses initialize the fifth element of io_array like this: | |
| 345 | |||
| 346 | io_array[4].iov_base = contents of member trailer_buf (char *) | ||
| 347 | from out_downcall member of global variable | ||
| 348 | vfs_request | ||
| 349 | io_array[4].iov_len = contents of member trailer_size (PVFS_size) | ||
| 350 | from out_downcall member of global variable | ||
| 351 | vfs_request | ||
| 352 | |||
| 353 | |||
