diff options
| -rw-r--r-- | Documentation/filesystems/api-summary.rst | 150 | ||||
| -rw-r--r-- | Documentation/filesystems/index.rst | 394 | ||||
| -rw-r--r-- | Documentation/filesystems/journalling.rst | 184 | ||||
| -rw-r--r-- | Documentation/filesystems/path-lookup.rst | 15 | ||||
| -rw-r--r-- | Documentation/filesystems/splice.rst | 22 |
5 files changed, 395 insertions, 370 deletions
diff --git a/Documentation/filesystems/api-summary.rst b/Documentation/filesystems/api-summary.rst new file mode 100644 index 000000000000..aa51ffcfa029 --- /dev/null +++ b/Documentation/filesystems/api-summary.rst | |||
| @@ -0,0 +1,150 @@ | |||
| 1 | ============================= | ||
| 2 | Linux Filesystems API summary | ||
| 3 | ============================= | ||
| 4 | |||
| 5 | This section contains API-level documentation, mostly taken from the source | ||
| 6 | code itself. | ||
| 7 | |||
| 8 | The Linux VFS | ||
| 9 | ============= | ||
| 10 | |||
| 11 | The Filesystem types | ||
| 12 | -------------------- | ||
| 13 | |||
| 14 | .. kernel-doc:: include/linux/fs.h | ||
| 15 | :internal: | ||
| 16 | |||
| 17 | The Directory Cache | ||
| 18 | ------------------- | ||
| 19 | |||
| 20 | .. kernel-doc:: fs/dcache.c | ||
| 21 | :export: | ||
| 22 | |||
| 23 | .. kernel-doc:: include/linux/dcache.h | ||
| 24 | :internal: | ||
| 25 | |||
| 26 | Inode Handling | ||
| 27 | -------------- | ||
| 28 | |||
| 29 | .. kernel-doc:: fs/inode.c | ||
| 30 | :export: | ||
| 31 | |||
| 32 | .. kernel-doc:: fs/bad_inode.c | ||
| 33 | :export: | ||
| 34 | |||
| 35 | Registration and Superblocks | ||
| 36 | ---------------------------- | ||
| 37 | |||
| 38 | .. kernel-doc:: fs/super.c | ||
| 39 | :export: | ||
| 40 | |||
| 41 | File Locks | ||
| 42 | ---------- | ||
| 43 | |||
| 44 | .. kernel-doc:: fs/locks.c | ||
| 45 | :export: | ||
| 46 | |||
| 47 | .. kernel-doc:: fs/locks.c | ||
| 48 | :internal: | ||
| 49 | |||
| 50 | Other Functions | ||
| 51 | --------------- | ||
| 52 | |||
| 53 | .. kernel-doc:: fs/mpage.c | ||
| 54 | :export: | ||
| 55 | |||
| 56 | .. kernel-doc:: fs/namei.c | ||
| 57 | :export: | ||
| 58 | |||
| 59 | .. kernel-doc:: fs/buffer.c | ||
| 60 | :export: | ||
| 61 | |||
| 62 | .. kernel-doc:: block/bio.c | ||
| 63 | :export: | ||
| 64 | |||
| 65 | .. kernel-doc:: fs/seq_file.c | ||
| 66 | :export: | ||
| 67 | |||
| 68 | .. kernel-doc:: fs/filesystems.c | ||
| 69 | :export: | ||
| 70 | |||
| 71 | .. kernel-doc:: fs/fs-writeback.c | ||
| 72 | :export: | ||
| 73 | |||
| 74 | .. kernel-doc:: fs/block_dev.c | ||
| 75 | :export: | ||
| 76 | |||
| 77 | .. kernel-doc:: fs/anon_inodes.c | ||
| 78 | :export: | ||
| 79 | |||
| 80 | .. kernel-doc:: fs/attr.c | ||
| 81 | :export: | ||
| 82 | |||
| 83 | .. kernel-doc:: fs/d_path.c | ||
| 84 | :export: | ||
| 85 | |||
| 86 | .. kernel-doc:: fs/dax.c | ||
| 87 | :export: | ||
| 88 | |||
| 89 | .. kernel-doc:: fs/direct-io.c | ||
| 90 | :export: | ||
| 91 | |||
| 92 | .. kernel-doc:: fs/file_table.c | ||
| 93 | :export: | ||
| 94 | |||
| 95 | .. kernel-doc:: fs/libfs.c | ||
| 96 | :export: | ||
| 97 | |||
| 98 | .. kernel-doc:: fs/posix_acl.c | ||
| 99 | :export: | ||
| 100 | |||
| 101 | .. kernel-doc:: fs/stat.c | ||
| 102 | :export: | ||
| 103 | |||
| 104 | .. kernel-doc:: fs/sync.c | ||
| 105 | :export: | ||
| 106 | |||
| 107 | .. kernel-doc:: fs/xattr.c | ||
| 108 | :export: | ||
| 109 | |||
| 110 | The proc filesystem | ||
| 111 | =================== | ||
| 112 | |||
| 113 | sysctl interface | ||
| 114 | ---------------- | ||
| 115 | |||
| 116 | .. kernel-doc:: kernel/sysctl.c | ||
| 117 | :export: | ||
| 118 | |||
| 119 | proc filesystem interface | ||
| 120 | ------------------------- | ||
| 121 | |||
| 122 | .. kernel-doc:: fs/proc/base.c | ||
| 123 | :internal: | ||
| 124 | |||
| 125 | Events based on file descriptors | ||
| 126 | ================================ | ||
| 127 | |||
| 128 | .. kernel-doc:: fs/eventfd.c | ||
| 129 | :export: | ||
| 130 | |||
| 131 | The Filesystem for Exporting Kernel Objects | ||
| 132 | =========================================== | ||
| 133 | |||
| 134 | .. kernel-doc:: fs/sysfs/file.c | ||
| 135 | :export: | ||
| 136 | |||
| 137 | .. kernel-doc:: fs/sysfs/symlink.c | ||
| 138 | :export: | ||
| 139 | |||
| 140 | The debugfs filesystem | ||
| 141 | ====================== | ||
| 142 | |||
| 143 | debugfs interface | ||
| 144 | ----------------- | ||
| 145 | |||
| 146 | .. kernel-doc:: fs/debugfs/inode.c | ||
| 147 | :export: | ||
| 148 | |||
| 149 | .. kernel-doc:: fs/debugfs/file.c | ||
| 150 | :export: | ||
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 61d2441b25d5..1131c34d77f6 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst | |||
| @@ -1,389 +1,43 @@ | |||
| 1 | ===================== | 1 | =============================== |
| 2 | Linux Filesystems API | 2 | Filesystems in the Linux kernel |
| 3 | ===================== | 3 | =============================== |
| 4 | 4 | ||
| 5 | The Linux VFS | 5 | This under-development manual will, some glorious day, provide |
| 6 | ============= | 6 | comprehensive information on how the Linux virtual filesystem (VFS) layer |
| 7 | works, along with the filesystems that sit below it. For now, what we have | ||
| 8 | can be found below. | ||
| 7 | 9 | ||
| 8 | The Filesystem types | 10 | Core VFS documentation |
| 9 | -------------------- | ||
| 10 | |||
| 11 | .. kernel-doc:: include/linux/fs.h | ||
| 12 | :internal: | ||
| 13 | |||
| 14 | The Directory Cache | ||
| 15 | ------------------- | ||
| 16 | |||
| 17 | .. kernel-doc:: fs/dcache.c | ||
| 18 | :export: | ||
| 19 | |||
| 20 | .. kernel-doc:: include/linux/dcache.h | ||
| 21 | :internal: | ||
| 22 | |||
| 23 | Inode Handling | ||
| 24 | -------------- | ||
| 25 | |||
| 26 | .. kernel-doc:: fs/inode.c | ||
| 27 | :export: | ||
| 28 | |||
| 29 | .. kernel-doc:: fs/bad_inode.c | ||
| 30 | :export: | ||
| 31 | |||
| 32 | Registration and Superblocks | ||
| 33 | ---------------------------- | ||
| 34 | |||
| 35 | .. kernel-doc:: fs/super.c | ||
| 36 | :export: | ||
| 37 | |||
| 38 | File Locks | ||
| 39 | ---------- | ||
| 40 | |||
| 41 | .. kernel-doc:: fs/locks.c | ||
| 42 | :export: | ||
| 43 | |||
| 44 | .. kernel-doc:: fs/locks.c | ||
| 45 | :internal: | ||
| 46 | |||
| 47 | Other Functions | ||
| 48 | --------------- | ||
| 49 | |||
| 50 | .. kernel-doc:: fs/mpage.c | ||
| 51 | :export: | ||
| 52 | |||
| 53 | .. kernel-doc:: fs/namei.c | ||
| 54 | :export: | ||
| 55 | |||
| 56 | .. kernel-doc:: fs/buffer.c | ||
| 57 | :export: | ||
| 58 | |||
| 59 | .. kernel-doc:: block/bio.c | ||
| 60 | :export: | ||
| 61 | |||
| 62 | .. kernel-doc:: fs/seq_file.c | ||
| 63 | :export: | ||
| 64 | |||
| 65 | .. kernel-doc:: fs/filesystems.c | ||
| 66 | :export: | ||
| 67 | |||
| 68 | .. kernel-doc:: fs/fs-writeback.c | ||
| 69 | :export: | ||
| 70 | |||
| 71 | .. kernel-doc:: fs/block_dev.c | ||
| 72 | :export: | ||
| 73 | |||
| 74 | .. kernel-doc:: fs/anon_inodes.c | ||
| 75 | :export: | ||
| 76 | |||
| 77 | .. kernel-doc:: fs/attr.c | ||
| 78 | :export: | ||
| 79 | |||
| 80 | .. kernel-doc:: fs/d_path.c | ||
| 81 | :export: | ||
| 82 | |||
| 83 | .. kernel-doc:: fs/dax.c | ||
| 84 | :export: | ||
| 85 | |||
| 86 | .. kernel-doc:: fs/direct-io.c | ||
| 87 | :export: | ||
| 88 | |||
| 89 | .. kernel-doc:: fs/file_table.c | ||
| 90 | :export: | ||
| 91 | |||
| 92 | .. kernel-doc:: fs/libfs.c | ||
| 93 | :export: | ||
| 94 | |||
| 95 | .. kernel-doc:: fs/posix_acl.c | ||
| 96 | :export: | ||
| 97 | |||
| 98 | .. kernel-doc:: fs/stat.c | ||
| 99 | :export: | ||
| 100 | |||
| 101 | .. kernel-doc:: fs/sync.c | ||
| 102 | :export: | ||
| 103 | |||
| 104 | .. kernel-doc:: fs/xattr.c | ||
| 105 | :export: | ||
| 106 | |||
| 107 | The proc filesystem | ||
| 108 | =================== | ||
| 109 | |||
| 110 | sysctl interface | ||
| 111 | ---------------- | ||
| 112 | |||
| 113 | .. kernel-doc:: kernel/sysctl.c | ||
| 114 | :export: | ||
| 115 | |||
| 116 | proc filesystem interface | ||
| 117 | ------------------------- | ||
| 118 | |||
| 119 | .. kernel-doc:: fs/proc/base.c | ||
| 120 | :internal: | ||
| 121 | |||
| 122 | Events based on file descriptors | ||
| 123 | ================================ | ||
| 124 | |||
| 125 | .. kernel-doc:: fs/eventfd.c | ||
| 126 | :export: | ||
| 127 | |||
| 128 | The Filesystem for Exporting Kernel Objects | ||
| 129 | =========================================== | ||
| 130 | |||
| 131 | .. kernel-doc:: fs/sysfs/file.c | ||
| 132 | :export: | ||
| 133 | |||
| 134 | .. kernel-doc:: fs/sysfs/symlink.c | ||
| 135 | :export: | ||
| 136 | |||
| 137 | The debugfs filesystem | ||
| 138 | ====================== | 11 | ====================== |
| 139 | 12 | ||
| 140 | debugfs interface | 13 | See these manuals for documentation about the VFS layer itself and how its |
| 141 | ----------------- | 14 | algorithms work. |
| 142 | 15 | ||
| 143 | .. kernel-doc:: fs/debugfs/inode.c | 16 | .. toctree:: |
| 144 | :export: | 17 | :maxdepth: 2 |
| 145 | 18 | ||
| 146 | .. kernel-doc:: fs/debugfs/file.c | 19 | path-lookup.rst |
| 147 | :export: | 20 | api-summary |
| 21 | splice | ||
| 148 | 22 | ||
| 149 | The Linux Journalling API | 23 | Filesystem support layers |
| 150 | ========================= | 24 | ========================= |
| 151 | 25 | ||
| 152 | Overview | 26 | Documentation for the support code within the filesystem layer for use in |
| 153 | -------- | 27 | filesystem implementations. |
| 154 | |||
| 155 | Details | ||
| 156 | ~~~~~~~ | ||
| 157 | |||
| 158 | The journalling layer is easy to use. You need to first of all create a | ||
| 159 | journal_t data structure. There are two calls to do this dependent on | ||
| 160 | how you decide to allocate the physical media on which the journal | ||
| 161 | resides. The :c:func:`jbd2_journal_init_inode` call is for journals stored in | ||
| 162 | filesystem inodes, or the :c:func:`jbd2_journal_init_dev` call can be used | ||
| 163 | for journal stored on a raw device (in a continuous range of blocks). A | ||
| 164 | journal_t is a typedef for a struct pointer, so when you are finally | ||
| 165 | finished make sure you call :c:func:`jbd2_journal_destroy` on it to free up | ||
| 166 | any used kernel memory. | ||
| 167 | |||
| 168 | Once you have got your journal_t object you need to 'mount' or load the | ||
| 169 | journal file. The journalling layer expects the space for the journal | ||
| 170 | was already allocated and initialized properly by the userspace tools. | ||
| 171 | When loading the journal you must call :c:func:`jbd2_journal_load` to process | ||
| 172 | journal contents. If the client file system detects the journal contents | ||
| 173 | does not need to be processed (or even need not have valid contents), it | ||
| 174 | may call :c:func:`jbd2_journal_wipe` to clear the journal contents before | ||
| 175 | calling :c:func:`jbd2_journal_load`. | ||
| 176 | |||
| 177 | Note that jbd2_journal_wipe(..,0) calls | ||
| 178 | :c:func:`jbd2_journal_skip_recovery` for you if it detects any outstanding | ||
| 179 | transactions in the journal and similarly :c:func:`jbd2_journal_load` will | ||
| 180 | call :c:func:`jbd2_journal_recover` if necessary. I would advise reading | ||
| 181 | :c:func:`ext4_load_journal` in fs/ext4/super.c for examples on this stage. | ||
| 182 | |||
| 183 | Now you can go ahead and start modifying the underlying filesystem. | ||
| 184 | Almost. | ||
| 185 | |||
| 186 | You still need to actually journal your filesystem changes, this is done | ||
| 187 | by wrapping them into transactions. Additionally you also need to wrap | ||
| 188 | the modification of each of the buffers with calls to the journal layer, | ||
| 189 | so it knows what the modifications you are actually making are. To do | ||
| 190 | this use :c:func:`jbd2_journal_start` which returns a transaction handle. | ||
| 191 | |||
| 192 | :c:func:`jbd2_journal_start` and its counterpart :c:func:`jbd2_journal_stop`, | ||
| 193 | which indicates the end of a transaction are nestable calls, so you can | ||
| 194 | reenter a transaction if necessary, but remember you must call | ||
| 195 | :c:func:`jbd2_journal_stop` the same number of times as | ||
| 196 | :c:func:`jbd2_journal_start` before the transaction is completed (or more | ||
| 197 | accurately leaves the update phase). Ext4/VFS makes use of this feature to | ||
| 198 | simplify handling of inode dirtying, quota support, etc. | ||
| 199 | |||
| 200 | Inside each transaction you need to wrap the modifications to the | ||
| 201 | individual buffers (blocks). Before you start to modify a buffer you | ||
| 202 | need to call :c:func:`jbd2_journal_get_create_access()` / | ||
| 203 | :c:func:`jbd2_journal_get_write_access()` / | ||
| 204 | :c:func:`jbd2_journal_get_undo_access()` as appropriate, this allows the | ||
| 205 | journalling layer to copy the unmodified | ||
| 206 | data if it needs to. After all the buffer may be part of a previously | ||
| 207 | uncommitted transaction. At this point you are at last ready to modify a | ||
| 208 | buffer, and once you have done so you need to call | ||
| 209 | :c:func:`jbd2_journal_dirty_metadata`. Or if you've asked for access to a | ||
| 210 | buffer you now know is no longer required to be pushed back on the | ||
| 211 | device you can call :c:func:`jbd2_journal_forget` in much the same way as you | ||
| 212 | might have used :c:func:`bforget` in the past. | ||
| 213 | |||
| 214 | A :c:func:`jbd2_journal_flush` may be called at any time to commit and | ||
| 215 | checkpoint all your transactions. | ||
| 216 | |||
| 217 | Then at umount time, in your :c:func:`put_super` you can then call | ||
| 218 | :c:func:`jbd2_journal_destroy` to clean up your in-core journal object. | ||
| 219 | |||
| 220 | Unfortunately there are a couple of ways the journal layer can cause a | ||
| 221 | deadlock. The first thing to note is that each task can only have a | ||
| 222 | single outstanding transaction at any one time, remember nothing commits | ||
| 223 | until the outermost :c:func:`jbd2_journal_stop`. This means you must complete | ||
| 224 | the transaction at the end of each file/inode/address etc. operation you | ||
| 225 | perform, so that the journalling system isn't re-entered on another | ||
| 226 | journal. Since transactions can't be nested/batched across differing | ||
| 227 | journals, and another filesystem other than yours (say ext4) may be | ||
| 228 | modified in a later syscall. | ||
| 229 | |||
| 230 | The second case to bear in mind is that :c:func:`jbd2_journal_start` can block | ||
| 231 | if there isn't enough space in the journal for your transaction (based | ||
| 232 | on the passed nblocks param) - when it blocks it merely(!) needs to wait | ||
| 233 | for transactions to complete and be committed from other tasks, so | ||
| 234 | essentially we are waiting for :c:func:`jbd2_journal_stop`. So to avoid | ||
| 235 | deadlocks you must treat :c:func:`jbd2_journal_start` / | ||
| 236 | :c:func:`jbd2_journal_stop` as if they were semaphores and include them in | ||
| 237 | your semaphore ordering rules to prevent | ||
| 238 | deadlocks. Note that :c:func:`jbd2_journal_extend` has similar blocking | ||
| 239 | behaviour to :c:func:`jbd2_journal_start` so you can deadlock here just as | ||
| 240 | easily as on :c:func:`jbd2_journal_start`. | ||
| 241 | |||
| 242 | Try to reserve the right number of blocks the first time. ;-). This will | ||
| 243 | be the maximum number of blocks you are going to touch in this | ||
| 244 | transaction. I advise having a look at at least ext4_jbd.h to see the | ||
| 245 | basis on which ext4 makes these decisions. | ||
| 246 | |||
| 247 | Another wriggle to watch out for is your on-disk block allocation | ||
| 248 | strategy. Why? Because, if you do a delete, you need to ensure you | ||
| 249 | haven't reused any of the freed blocks until the transaction freeing | ||
| 250 | these blocks commits. If you reused these blocks and a crash happens, | ||
| 251 | there is no way to restore the contents of the reallocated blocks at the | ||
| 252 | end of the last fully committed transaction. One simple way of doing | ||
| 253 | this is to mark blocks as free in internal in-memory block allocation | ||
| 254 | structures only after the transaction freeing them commits. Ext4 uses | ||
| 255 | journal commit callback for this purpose. | ||
| 256 | |||
| 257 | With journal commit callbacks you can ask the journalling layer to call | ||
| 258 | a callback function when the transaction is finally committed to disk, | ||
| 259 | so that you can do some of your own management. You ask the journalling | ||
| 260 | layer for calling the callback by simply setting | ||
| 261 | ``journal->j_commit_callback`` function pointer and that function is | ||
| 262 | called after each transaction commit. You can also use | ||
| 263 | ``transaction->t_private_list`` for attaching entries to a transaction | ||
| 264 | that need processing when the transaction commits. | ||
| 265 | |||
| 266 | JBD2 also provides a way to block all transaction updates via | ||
| 267 | :c:func:`jbd2_journal_lock_updates()` / | ||
| 268 | :c:func:`jbd2_journal_unlock_updates()`. Ext4 uses this when it wants a | ||
| 269 | window with a clean and stable fs for a moment. E.g. | ||
| 270 | |||
| 271 | :: | ||
| 272 | |||
| 273 | |||
| 274 | jbd2_journal_lock_updates() //stop new stuff happening.. | ||
| 275 | jbd2_journal_flush() // checkpoint everything. | ||
| 276 | ..do stuff on stable fs | ||
| 277 | jbd2_journal_unlock_updates() // carry on with filesystem use. | ||
| 278 | |||
| 279 | The opportunities for abuse and DOS attacks with this should be obvious, | ||
| 280 | if you allow unprivileged userspace to trigger codepaths containing | ||
| 281 | these calls. | ||
| 282 | |||
| 283 | Summary | ||
| 284 | ~~~~~~~ | ||
| 285 | |||
| 286 | Using the journal is a matter of wrapping the different context changes, | ||
| 287 | being each mount, each modification (transaction) and each changed | ||
| 288 | buffer to tell the journalling layer about them. | ||
| 289 | |||
| 290 | Data Types | ||
| 291 | ---------- | ||
| 292 | |||
| 293 | The journalling layer uses typedefs to 'hide' the concrete definitions | ||
| 294 | of the structures used. As a client of the JBD2 layer you can just rely | ||
| 295 | on using the pointer as a magic cookie of some sort. Obviously the | ||
| 296 | hiding is not enforced as this is 'C'. | ||
| 297 | |||
| 298 | Structures | ||
| 299 | ~~~~~~~~~~ | ||
| 300 | |||
| 301 | .. kernel-doc:: include/linux/jbd2.h | ||
| 302 | :internal: | ||
| 303 | |||
| 304 | Functions | ||
| 305 | --------- | ||
| 306 | |||
| 307 | The functions here are split into two groups: those that affect a journal | ||
| 308 | as a whole, and those which are used to manage transactions. | ||
| 309 | |||
| 310 | Journal Level | ||
| 311 | ~~~~~~~~~~~~~ | ||
| 312 | |||
| 313 | .. kernel-doc:: fs/jbd2/journal.c | ||
| 314 | :export: | ||
| 315 | |||
| 316 | .. kernel-doc:: fs/jbd2/recovery.c | ||
| 317 | :internal: | ||
| 318 | |||
| 319 | Transaction Level | ||
| 320 | ~~~~~~~~~~~~~~~~~~ | ||
| 321 | |||
| 322 | .. kernel-doc:: fs/jbd2/transaction.c | ||
| 323 | |||
| 324 | See also | ||
| 325 | -------- | ||
| 326 | |||
| 327 | `Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen | ||
| 328 | Tweedie <http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz>`__ | ||
| 329 | |||
| 330 | `Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen | ||
| 331 | Tweedie <http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html>`__ | ||
| 332 | |||
| 333 | splice API | ||
| 334 | ========== | ||
| 335 | |||
| 336 | splice is a method for moving blocks of data around inside the kernel, | ||
| 337 | without continually transferring them between the kernel and user space. | ||
| 338 | |||
| 339 | .. kernel-doc:: fs/splice.c | ||
| 340 | |||
| 341 | pipes API | ||
| 342 | ========= | ||
| 343 | |||
| 344 | Pipe interfaces are all for in-kernel (builtin image) use. They are not | ||
| 345 | exported for use by modules. | ||
| 346 | |||
| 347 | .. kernel-doc:: include/linux/pipe_fs_i.h | ||
| 348 | :internal: | ||
| 349 | |||
| 350 | .. kernel-doc:: fs/pipe.c | ||
| 351 | |||
| 352 | Encryption API | ||
| 353 | ============== | ||
| 354 | |||
| 355 | A library which filesystems can hook into to support transparent | ||
| 356 | encryption of files and directories. | ||
| 357 | 28 | ||
| 358 | .. toctree:: | 29 | .. toctree:: |
| 359 | :maxdepth: 2 | 30 | :maxdepth: 2 |
| 360 | |||
| 361 | fscrypt | ||
| 362 | |||
| 363 | Pathname lookup | ||
| 364 | =============== | ||
| 365 | |||
| 366 | |||
| 367 | This write-up is based on three articles published at lwn.net: | ||
| 368 | 31 | ||
| 369 | - <https://lwn.net/Articles/649115/> Pathname lookup in Linux | 32 | journalling |
| 370 | - <https://lwn.net/Articles/649729/> RCU-walk: faster pathname lookup in Linux | 33 | fscrypt |
| 371 | - <https://lwn.net/Articles/650786/> A walk among the symlinks | ||
| 372 | 34 | ||
| 373 | Written by Neil Brown with help from Al Viro and Jon Corbet. | 35 | Filesystem-specific documentation |
| 374 | It has subsequently been updated to reflect changes in the kernel | 36 | ================================= |
| 375 | including: | ||
| 376 | 37 | ||
| 377 | - per-directory parallel name lookup. | 38 | Documentation for individual filesystem types can be found here. |
| 378 | 39 | ||
| 379 | .. toctree:: | 40 | .. toctree:: |
| 380 | :maxdepth: 2 | 41 | :maxdepth: 2 |
| 381 | 42 | ||
| 382 | path-lookup.rst | ||
| 383 | |||
| 384 | binderfs | ||
| 385 | ======== | ||
| 386 | |||
| 387 | .. toctree:: | ||
| 388 | |||
| 389 | binderfs.rst | 43 | binderfs.rst |
diff --git a/Documentation/filesystems/journalling.rst b/Documentation/filesystems/journalling.rst new file mode 100644 index 000000000000..58ce6b395206 --- /dev/null +++ b/Documentation/filesystems/journalling.rst | |||
| @@ -0,0 +1,184 @@ | |||
| 1 | The Linux Journalling API | ||
| 2 | ========================= | ||
| 3 | |||
| 4 | Overview | ||
| 5 | -------- | ||
| 6 | |||
| 7 | Details | ||
| 8 | ~~~~~~~ | ||
| 9 | |||
| 10 | The journalling layer is easy to use. You need to first of all create a | ||
| 11 | journal_t data structure. There are two calls to do this dependent on | ||
| 12 | how you decide to allocate the physical media on which the journal | ||
| 13 | resides. The :c:func:`jbd2_journal_init_inode` call is for journals stored in | ||
| 14 | filesystem inodes, or the :c:func:`jbd2_journal_init_dev` call can be used | ||
| 15 | for journal stored on a raw device (in a continuous range of blocks). A | ||
| 16 | journal_t is a typedef for a struct pointer, so when you are finally | ||
| 17 | finished make sure you call :c:func:`jbd2_journal_destroy` on it to free up | ||
| 18 | any used kernel memory. | ||
| 19 | |||
| 20 | Once you have got your journal_t object you need to 'mount' or load the | ||
| 21 | journal file. The journalling layer expects the space for the journal | ||
| 22 | was already allocated and initialized properly by the userspace tools. | ||
| 23 | When loading the journal you must call :c:func:`jbd2_journal_load` to process | ||
| 24 | journal contents. If the client file system detects the journal contents | ||
| 25 | does not need to be processed (or even need not have valid contents), it | ||
| 26 | may call :c:func:`jbd2_journal_wipe` to clear the journal contents before | ||
| 27 | calling :c:func:`jbd2_journal_load`. | ||
| 28 | |||
| 29 | Note that jbd2_journal_wipe(..,0) calls | ||
| 30 | :c:func:`jbd2_journal_skip_recovery` for you if it detects any outstanding | ||
| 31 | transactions in the journal and similarly :c:func:`jbd2_journal_load` will | ||
| 32 | call :c:func:`jbd2_journal_recover` if necessary. I would advise reading | ||
| 33 | :c:func:`ext4_load_journal` in fs/ext4/super.c for examples on this stage. | ||
| 34 | |||
| 35 | Now you can go ahead and start modifying the underlying filesystem. | ||
| 36 | Almost. | ||
| 37 | |||
| 38 | You still need to actually journal your filesystem changes, this is done | ||
| 39 | by wrapping them into transactions. Additionally you also need to wrap | ||
| 40 | the modification of each of the buffers with calls to the journal layer, | ||
| 41 | so it knows what the modifications you are actually making are. To do | ||
| 42 | this use :c:func:`jbd2_journal_start` which returns a transaction handle. | ||
| 43 | |||
| 44 | :c:func:`jbd2_journal_start` and its counterpart :c:func:`jbd2_journal_stop`, | ||
| 45 | which indicates the end of a transaction are nestable calls, so you can | ||
| 46 | reenter a transaction if necessary, but remember you must call | ||
| 47 | :c:func:`jbd2_journal_stop` the same number of times as | ||
| 48 | :c:func:`jbd2_journal_start` before the transaction is completed (or more | ||
| 49 | accurately leaves the update phase). Ext4/VFS makes use of this feature to | ||
| 50 | simplify handling of inode dirtying, quota support, etc. | ||
| 51 | |||
| 52 | Inside each transaction you need to wrap the modifications to the | ||
| 53 | individual buffers (blocks). Before you start to modify a buffer you | ||
| 54 | need to call :c:func:`jbd2_journal_get_create_access()` / | ||
| 55 | :c:func:`jbd2_journal_get_write_access()` / | ||
| 56 | :c:func:`jbd2_journal_get_undo_access()` as appropriate, this allows the | ||
| 57 | journalling layer to copy the unmodified | ||
| 58 | data if it needs to. After all the buffer may be part of a previously | ||
| 59 | uncommitted transaction. At this point you are at last ready to modify a | ||
| 60 | buffer, and once you have done so you need to call | ||
| 61 | :c:func:`jbd2_journal_dirty_metadata`. Or if you've asked for access to a | ||
| 62 | buffer you now know is no longer required to be pushed back on the | ||
| 63 | device you can call :c:func:`jbd2_journal_forget` in much the same way as you | ||
| 64 | might have used :c:func:`bforget` in the past. | ||
| 65 | |||
| 66 | A :c:func:`jbd2_journal_flush` may be called at any time to commit and | ||
| 67 | checkpoint all your transactions. | ||
| 68 | |||
| 69 | Then at umount time, in your :c:func:`put_super` you can then call | ||
| 70 | :c:func:`jbd2_journal_destroy` to clean up your in-core journal object. | ||
| 71 | |||
| 72 | Unfortunately there are a couple of ways the journal layer can cause a | ||
| 73 | deadlock. The first thing to note is that each task can only have a | ||
| 74 | single outstanding transaction at any one time, remember nothing commits | ||
| 75 | until the outermost :c:func:`jbd2_journal_stop`. This means you must complete | ||
| 76 | the transaction at the end of each file/inode/address etc. operation you | ||
| 77 | perform, so that the journalling system isn't re-entered on another | ||
| 78 | journal. Since transactions can't be nested/batched across differing | ||
| 79 | journals, and another filesystem other than yours (say ext4) may be | ||
| 80 | modified in a later syscall. | ||
| 81 | |||
| 82 | The second case to bear in mind is that :c:func:`jbd2_journal_start` can block | ||
| 83 | if there isn't enough space in the journal for your transaction (based | ||
| 84 | on the passed nblocks param) - when it blocks it merely(!) needs to wait | ||
| 85 | for transactions to complete and be committed from other tasks, so | ||
| 86 | essentially we are waiting for :c:func:`jbd2_journal_stop`. So to avoid | ||
| 87 | deadlocks you must treat :c:func:`jbd2_journal_start` / | ||
| 88 | :c:func:`jbd2_journal_stop` as if they were semaphores and include them in | ||
| 89 | your semaphore ordering rules to prevent | ||
| 90 | deadlocks. Note that :c:func:`jbd2_journal_extend` has similar blocking | ||
| 91 | behaviour to :c:func:`jbd2_journal_start` so you can deadlock here just as | ||
| 92 | easily as on :c:func:`jbd2_journal_start`. | ||
| 93 | |||
| 94 | Try to reserve the right number of blocks the first time. ;-). This will | ||
| 95 | be the maximum number of blocks you are going to touch in this | ||
| 96 | transaction. I advise having a look at at least ext4_jbd.h to see the | ||
| 97 | basis on which ext4 makes these decisions. | ||
| 98 | |||
| 99 | Another wriggle to watch out for is your on-disk block allocation | ||
| 100 | strategy. Why? Because, if you do a delete, you need to ensure you | ||
| 101 | haven't reused any of the freed blocks until the transaction freeing | ||
| 102 | these blocks commits. If you reused these blocks and a crash happens, | ||
| 103 | there is no way to restore the contents of the reallocated blocks at the | ||
| 104 | end of the last fully committed transaction. One simple way of doing | ||
| 105 | this is to mark blocks as free in internal in-memory block allocation | ||
| 106 | structures only after the transaction freeing them commits. Ext4 uses | ||
| 107 | journal commit callback for this purpose. | ||
| 108 | |||
| 109 | With journal commit callbacks you can ask the journalling layer to call | ||
| 110 | a callback function when the transaction is finally committed to disk, | ||
| 111 | so that you can do some of your own management. You ask the journalling | ||
| 112 | layer for calling the callback by simply setting | ||
| 113 | ``journal->j_commit_callback`` function pointer and that function is | ||
| 114 | called after each transaction commit. You can also use | ||
| 115 | ``transaction->t_private_list`` for attaching entries to a transaction | ||
| 116 | that need processing when the transaction commits. | ||
| 117 | |||
| 118 | JBD2 also provides a way to block all transaction updates via | ||
| 119 | :c:func:`jbd2_journal_lock_updates()` / | ||
| 120 | :c:func:`jbd2_journal_unlock_updates()`. Ext4 uses this when it wants a | ||
| 121 | window with a clean and stable fs for a moment. E.g. | ||
| 122 | |||
| 123 | :: | ||
| 124 | |||
| 125 | |||
| 126 | jbd2_journal_lock_updates() //stop new stuff happening.. | ||
| 127 | jbd2_journal_flush() // checkpoint everything. | ||
| 128 | ..do stuff on stable fs | ||
| 129 | jbd2_journal_unlock_updates() // carry on with filesystem use. | ||
| 130 | |||
| 131 | The opportunities for abuse and DOS attacks with this should be obvious, | ||
| 132 | if you allow unprivileged userspace to trigger codepaths containing | ||
| 133 | these calls. | ||
| 134 | |||
| 135 | Summary | ||
| 136 | ~~~~~~~ | ||
| 137 | |||
| 138 | Using the journal is a matter of wrapping the different context changes, | ||
| 139 | being each mount, each modification (transaction) and each changed | ||
| 140 | buffer to tell the journalling layer about them. | ||
| 141 | |||
| 142 | Data Types | ||
| 143 | ---------- | ||
| 144 | |||
| 145 | The journalling layer uses typedefs to 'hide' the concrete definitions | ||
| 146 | of the structures used. As a client of the JBD2 layer you can just rely | ||
| 147 | on using the pointer as a magic cookie of some sort. Obviously the | ||
| 148 | hiding is not enforced as this is 'C'. | ||
| 149 | |||
| 150 | Structures | ||
| 151 | ~~~~~~~~~~ | ||
| 152 | |||
| 153 | .. kernel-doc:: include/linux/jbd2.h | ||
| 154 | :internal: | ||
| 155 | |||
| 156 | Functions | ||
| 157 | --------- | ||
| 158 | |||
| 159 | The functions here are split into two groups: those that affect a journal | ||
| 160 | as a whole, and those which are used to manage transactions. | ||
| 161 | |||
| 162 | Journal Level | ||
| 163 | ~~~~~~~~~~~~~ | ||
| 164 | |||
| 165 | .. kernel-doc:: fs/jbd2/journal.c | ||
| 166 | :export: | ||
| 167 | |||
| 168 | .. kernel-doc:: fs/jbd2/recovery.c | ||
| 169 | :internal: | ||
| 170 | |||
| 171 | Transaction Level | ||
| 172 | ~~~~~~~~~~~~~~~~~~ | ||
| 173 | |||
| 174 | .. kernel-doc:: fs/jbd2/transaction.c | ||
| 175 | |||
| 176 | See also | ||
| 177 | -------- | ||
| 178 | |||
| 179 | `Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen | ||
| 180 | Tweedie <http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz>`__ | ||
| 181 | |||
| 182 | `Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen | ||
| 183 | Tweedie <http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html>`__ | ||
| 184 | |||
diff --git a/Documentation/filesystems/path-lookup.rst b/Documentation/filesystems/path-lookup.rst index 80e22eda4132..434a07b0002b 100644 --- a/Documentation/filesystems/path-lookup.rst +++ b/Documentation/filesystems/path-lookup.rst | |||
| @@ -1,3 +1,18 @@ | |||
| 1 | =============== | ||
| 2 | Pathname lookup | ||
| 3 | =============== | ||
| 4 | |||
| 5 | This write-up is based on three articles published at lwn.net: | ||
| 6 | |||
| 7 | - <https://lwn.net/Articles/649115/> Pathname lookup in Linux | ||
| 8 | - <https://lwn.net/Articles/649729/> RCU-walk: faster pathname lookup in Linux | ||
| 9 | - <https://lwn.net/Articles/650786/> A walk among the symlinks | ||
| 10 | |||
| 11 | Written by Neil Brown with help from Al Viro and Jon Corbet. | ||
| 12 | It has subsequently been updated to reflect changes in the kernel | ||
| 13 | including: | ||
| 14 | |||
| 15 | - per-directory parallel name lookup. | ||
| 1 | 16 | ||
| 2 | Introduction to pathname lookup | 17 | Introduction to pathname lookup |
| 3 | =============================== | 18 | =============================== |
diff --git a/Documentation/filesystems/splice.rst b/Documentation/filesystems/splice.rst new file mode 100644 index 000000000000..edd874808472 --- /dev/null +++ b/Documentation/filesystems/splice.rst | |||
| @@ -0,0 +1,22 @@ | |||
| 1 | ================ | ||
| 2 | splice and pipes | ||
| 3 | ================ | ||
| 4 | |||
| 5 | splice API | ||
| 6 | ========== | ||
| 7 | |||
| 8 | splice is a method for moving blocks of data around inside the kernel, | ||
| 9 | without continually transferring them between the kernel and user space. | ||
| 10 | |||
| 11 | .. kernel-doc:: fs/splice.c | ||
| 12 | |||
| 13 | pipes API | ||
| 14 | ========= | ||
| 15 | |||
| 16 | Pipe interfaces are all for in-kernel (builtin image) use. They are not | ||
| 17 | exported for use by modules. | ||
| 18 | |||
| 19 | .. kernel-doc:: include/linux/pipe_fs_i.h | ||
| 20 | :internal: | ||
| 21 | |||
| 22 | .. kernel-doc:: fs/pipe.c | ||
