aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/00-INDEX18
-rw-r--r--Documentation/filesystems/Locking18
-rw-r--r--Documentation/filesystems/Makefile8
-rw-r--r--Documentation/filesystems/caching/fscache.txt110
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt21
-rw-r--r--Documentation/filesystems/dentry-locking.txt3
-rw-r--r--Documentation/filesystems/dnotify.txt39
-rw-r--r--Documentation/filesystems/dnotify_test.c34
-rw-r--r--Documentation/filesystems/exofs.txt23
-rw-r--r--Documentation/filesystems/ext3.txt20
-rw-r--r--Documentation/filesystems/ext4.txt33
-rw-r--r--Documentation/filesystems/logfs.txt241
-rw-r--r--Documentation/filesystems/nfs/00-INDEX16
-rw-r--r--Documentation/filesystems/nfs/Exporting (renamed from Documentation/filesystems/Exporting)0
-rw-r--r--Documentation/filesystems/nfs/knfsd-stats.txt (renamed from Documentation/filesystems/knfsd-stats.txt)0
-rw-r--r--Documentation/filesystems/nfs/nfs-rdma.txt (renamed from Documentation/filesystems/nfs-rdma.txt)0
-rw-r--r--Documentation/filesystems/nfs/nfs.txt (renamed from Documentation/filesystems/nfs.txt)0
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt (renamed from Documentation/filesystems/nfs41-server.txt)14
-rw-r--r--Documentation/filesystems/nfs/nfsroot.txt (renamed from Documentation/filesystems/nfsroot.txt)0
-rw-r--r--Documentation/filesystems/nfs/rpc-cache.txt (renamed from Documentation/filesystems/rpc-cache.txt)0
-rw-r--r--Documentation/filesystems/nilfs2.txt12
-rw-r--r--Documentation/filesystems/ocfs2.txt6
-rw-r--r--Documentation/filesystems/porting2
-rw-r--r--Documentation/filesystems/proc.txt74
-rw-r--r--Documentation/filesystems/seq_file.txt4
-rw-r--r--Documentation/filesystems/sharedsubtree.txt16
-rw-r--r--Documentation/filesystems/sysfs.txt12
-rw-r--r--Documentation/filesystems/vfat.txt2
-rw-r--r--Documentation/filesystems/vfs.txt2
29 files changed, 609 insertions, 119 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index f15621ee5599..3bae418c6ad3 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -1,7 +1,5 @@
100-INDEX 100-INDEX
2 - this file (info on some of the filesystems supported by linux). 2 - this file (info on some of the filesystems supported by linux).
3Exporting
4 - explanation of how to make filesystems exportable.
5Locking 3Locking
6 - info on locking rules as they pertain to Linux VFS. 4 - info on locking rules as they pertain to Linux VFS.
79p.txt 59p.txt
@@ -34,8 +32,12 @@ dlmfs.txt
34 - info on the userspace interface to the OCFS2 DLM. 32 - info on the userspace interface to the OCFS2 DLM.
35dnotify.txt 33dnotify.txt
36 - info about directory notification in Linux. 34 - info about directory notification in Linux.
35dnotify_test.c
36 - example program for dnotify
37ecryptfs.txt 37ecryptfs.txt
38 - docs on eCryptfs: stacked cryptographic filesystem for Linux. 38 - docs on eCryptfs: stacked cryptographic filesystem for Linux.
39exofs.txt
40 - info, usage, mount options, design about EXOFS.
39ext2.txt 41ext2.txt
40 - info, mount options and specifications for the Ext2 filesystem. 42 - info, mount options and specifications for the Ext2 filesystem.
41ext3.txt 43ext3.txt
@@ -62,16 +64,14 @@ jfs.txt
62 - info and mount options for the JFS filesystem. 64 - info and mount options for the JFS filesystem.
63locks.txt 65locks.txt
64 - info on file locking implementations, flock() vs. fcntl(), etc. 66 - info on file locking implementations, flock() vs. fcntl(), etc.
67logfs.txt
68 - info on the LogFS flash filesystem.
65mandatory-locking.txt 69mandatory-locking.txt
66 - info on the Linux implementation of Sys V mandatory file locking. 70 - info on the Linux implementation of Sys V mandatory file locking.
67ncpfs.txt 71ncpfs.txt
68 - info on Novell Netware(tm) filesystem using NCP protocol. 72 - info on Novell Netware(tm) filesystem using NCP protocol.
69nfs41-server.txt 73nfs/
70 - info on the Linux server implementation of NFSv4 minor version 1. 74 - nfs-related documentation.
71nfs-rdma.txt
72 - how to install and setup the Linux NFS/RDMA client and server software.
73nfsroot.txt
74 - short guide on setting up a diskless box with NFS root filesystem.
75nilfs2.txt 75nilfs2.txt
76 - info and mount options for the NILFS2 filesystem. 76 - info and mount options for the NILFS2 filesystem.
77ntfs.txt 77ntfs.txt
@@ -90,8 +90,6 @@ relay.txt
90 - info on relay, for efficient streaming from kernel to user space. 90 - info on relay, for efficient streaming from kernel to user space.
91romfs.txt 91romfs.txt
92 - description of the ROMFS filesystem. 92 - description of the ROMFS filesystem.
93rpc-cache.txt
94 - introduction to the caching mechanisms in the sunrpc layer.
95seq_file.txt 93seq_file.txt
96 - how to use the seq_file API 94 - how to use the seq_file API
97sharedsubtree.txt 95sharedsubtree.txt
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 18b9d0ca0630..06bbbed71206 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -460,13 +460,6 @@ in sys_read() and friends.
460 460
461--------------------------- dquot_operations ------------------------------- 461--------------------------- dquot_operations -------------------------------
462prototypes: 462prototypes:
463 int (*initialize) (struct inode *, int);
464 int (*drop) (struct inode *);
465 int (*alloc_space) (struct inode *, qsize_t, int);
466 int (*alloc_inode) (const struct inode *, unsigned long);
467 int (*free_space) (struct inode *, qsize_t);
468 int (*free_inode) (const struct inode *, unsigned long);
469 int (*transfer) (struct inode *, struct iattr *);
470 int (*write_dquot) (struct dquot *); 463 int (*write_dquot) (struct dquot *);
471 int (*acquire_dquot) (struct dquot *); 464 int (*acquire_dquot) (struct dquot *);
472 int (*release_dquot) (struct dquot *); 465 int (*release_dquot) (struct dquot *);
@@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
479What filesystem should expect from the generic quota functions: 472What filesystem should expect from the generic quota functions:
480 473
481 FS recursion Held locks when called 474 FS recursion Held locks when called
482initialize: yes maybe dqonoff_sem
483drop: yes -
484alloc_space: ->mark_dirty() -
485alloc_inode: ->mark_dirty() -
486free_space: ->mark_dirty() -
487free_inode: ->mark_dirty() -
488transfer: yes -
489write_dquot: yes dqonoff_sem or dqptr_sem 475write_dquot: yes dqonoff_sem or dqptr_sem
490acquire_dquot: yes dqonoff_sem or dqptr_sem 476acquire_dquot: yes dqonoff_sem or dqptr_sem
491release_dquot: yes dqonoff_sem or dqptr_sem 477release_dquot: yes dqonoff_sem or dqptr_sem
@@ -495,10 +481,6 @@ write_info: yes dqonoff_sem
495FS recursion means calling ->quota_read() and ->quota_write() from superblock 481FS recursion means calling ->quota_read() and ->quota_write() from superblock
496operations. 482operations.
497 483
498->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
499only directly by the filesystem and do not call any fs functions only
500the ->mark_dirty() operation.
501
502More details about quota locking can be found in fs/dquot.c. 484More details about quota locking can be found in fs/dquot.c.
503 485
504--------------------------- vm_operations_struct ----------------------------- 486--------------------------- vm_operations_struct -----------------------------
diff --git a/Documentation/filesystems/Makefile b/Documentation/filesystems/Makefile
new file mode 100644
index 000000000000..a5dd114da14f
--- /dev/null
+++ b/Documentation/filesystems/Makefile
@@ -0,0 +1,8 @@
1# kbuild trick to avoid linker error. Can be omitted if a module is built.
2obj- := dummy.o
3
4# List of programs to build
5hostprogs-y := dnotify_test
6
7# Tell kbuild to always build the programs
8always := $(hostprogs-y)
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 9e94b9491d89..a91e2e2095b0 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -235,6 +235,7 @@ proc files.
235 neg=N Number of negative lookups made 235 neg=N Number of negative lookups made
236 pos=N Number of positive lookups made 236 pos=N Number of positive lookups made
237 crt=N Number of objects created by lookup 237 crt=N Number of objects created by lookup
238 tmo=N Number of lookups timed out and requeued
238 Updates n=N Number of update cookie requests seen 239 Updates n=N Number of update cookie requests seen
239 nul=N Number of upd reqs given a NULL parent 240 nul=N Number of upd reqs given a NULL parent
240 run=N Number of upd reqs granted CPU time 241 run=N Number of upd reqs granted CPU time
@@ -250,8 +251,10 @@ proc files.
250 ok=N Number of successful alloc reqs 251 ok=N Number of successful alloc reqs
251 wt=N Number of alloc reqs that waited on lookup completion 252 wt=N Number of alloc reqs that waited on lookup completion
252 nbf=N Number of alloc reqs rejected -ENOBUFS 253 nbf=N Number of alloc reqs rejected -ENOBUFS
254 int=N Number of alloc reqs aborted -ERESTARTSYS
253 ops=N Number of alloc reqs submitted 255 ops=N Number of alloc reqs submitted
254 owt=N Number of alloc reqs waited for CPU time 256 owt=N Number of alloc reqs waited for CPU time
257 abt=N Number of alloc reqs aborted due to object death
255 Retrvls n=N Number of retrieval (read) requests seen 258 Retrvls n=N Number of retrieval (read) requests seen
256 ok=N Number of successful retr reqs 259 ok=N Number of successful retr reqs
257 wt=N Number of retr reqs that waited on lookup completion 260 wt=N Number of retr reqs that waited on lookup completion
@@ -261,6 +264,7 @@ proc files.
261 oom=N Number of retr reqs failed -ENOMEM 264 oom=N Number of retr reqs failed -ENOMEM
262 ops=N Number of retr reqs submitted 265 ops=N Number of retr reqs submitted
263 owt=N Number of retr reqs waited for CPU time 266 owt=N Number of retr reqs waited for CPU time
267 abt=N Number of retr reqs aborted due to object death
264 Stores n=N Number of storage (write) requests seen 268 Stores n=N Number of storage (write) requests seen
265 ok=N Number of successful store reqs 269 ok=N Number of successful store reqs
266 agn=N Number of store reqs on a page already pending storage 270 agn=N Number of store reqs on a page already pending storage
@@ -268,12 +272,37 @@ proc files.
268 oom=N Number of store reqs failed -ENOMEM 272 oom=N Number of store reqs failed -ENOMEM
269 ops=N Number of store reqs submitted 273 ops=N Number of store reqs submitted
270 run=N Number of store reqs granted CPU time 274 run=N Number of store reqs granted CPU time
275 pgs=N Number of pages given store req processing time
276 rxd=N Number of store reqs deleted from tracking tree
277 olm=N Number of store reqs over store limit
278 VmScan nos=N Number of release reqs against pages with no pending store
279 gon=N Number of release reqs against pages stored by the time the lock was granted
280 bsy=N Number of release reqs ignored due to in-progress store
281 can=N Number of page stores cancelled due to release req
271 Ops pend=N Number of times async ops added to pending queues 282 Ops pend=N Number of times async ops added to pending queues
272 run=N Number of times async ops given CPU time 283 run=N Number of times async ops given CPU time
273 enq=N Number of times async ops queued for processing 284 enq=N Number of times async ops queued for processing
285 can=N Number of async ops cancelled
286 rej=N Number of async ops rejected due to object lookup/create failure
274 dfr=N Number of async ops queued for deferred release 287 dfr=N Number of async ops queued for deferred release
275 rel=N Number of async ops released 288 rel=N Number of async ops released
276 gc=N Number of deferred-release async ops garbage collected 289 gc=N Number of deferred-release async ops garbage collected
290 CacheOp alo=N Number of in-progress alloc_object() cache ops
291 luo=N Number of in-progress lookup_object() cache ops
292 luc=N Number of in-progress lookup_complete() cache ops
293 gro=N Number of in-progress grab_object() cache ops
294 upo=N Number of in-progress update_object() cache ops
295 dro=N Number of in-progress drop_object() cache ops
296 pto=N Number of in-progress put_object() cache ops
297 syn=N Number of in-progress sync_cache() cache ops
298 atc=N Number of in-progress attr_changed() cache ops
299 rap=N Number of in-progress read_or_alloc_page() cache ops
300 ras=N Number of in-progress read_or_alloc_pages() cache ops
301 alp=N Number of in-progress allocate_page() cache ops
302 als=N Number of in-progress allocate_pages() cache ops
303 wrp=N Number of in-progress write_page() cache ops
304 ucp=N Number of in-progress uncache_page() cache ops
305 dsp=N Number of in-progress dissociate_pages() cache ops
277 306
278 307
279 (*) /proc/fs/fscache/histogram 308 (*) /proc/fs/fscache/histogram
@@ -299,6 +328,87 @@ proc files.
299 jiffy range covered, and the SECS field the equivalent number of seconds. 328 jiffy range covered, and the SECS field the equivalent number of seconds.
300 329
301 330
331===========
332OBJECT LIST
333===========
334
335If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a
336list of all the objects currently allocated and allow them to be viewed
337through:
338
339 /proc/fs/fscache/objects
340
341This will look something like:
342
343 [root@andromeda ~]# head /proc/fs/fscache/objects
344 OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA
345 ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
346 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
347 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
348
349where the first set of columns before the '|' describe the object:
350
351 COLUMN DESCRIPTION
352 ======= ===============================================================
353 OBJECT Object debugging ID (appears as OBJ%x in some debug messages)
354 PARENT Debugging ID of parent object
355 STAT Object state
356 CHLDN Number of child objects of this object
357 OPS Number of outstanding operations on this object
358 OOP Number of outstanding child object management operations
359 IPR
360 EX Number of outstanding exclusive operations
361 READS Number of outstanding read operations
362 EM Object's event mask
363 EV Events raised on this object
364 F Object flags
365 S Object slow-work work item flags
366
367and the second set of columns describe the object's cookie, if present:
368
369 COLUMN DESCRIPTION
370 =============== =======================================================
371 NETFS_COOKIE_DEF Name of netfs cookie definition
372 TY Cookie type (IX - index, DT - data, hex - special)
373 FL Cookie flags
374 NETFS_DATA Netfs private data stored in the cookie
375 OBJECT_KEY Object key } 1 column, with separating comma
376 AUX_DATA Object aux data } presence may be configured
377
378The data shown may be filtered by attaching a key to an appropriate keyring
379before viewing the file. Something like:
380
381 keyctl add user fscache:objlist <restrictions> @s
382
383where <restrictions> are a selection of the following letters:
384
385 K Show hexdump of object key (don't show if not given)
386 A Show hexdump of object aux data (don't show if not given)
387
388and the following paired letters:
389
390 C Show objects that have a cookie
391 c Show objects that don't have a cookie
392 B Show objects that are busy
393 b Show objects that aren't busy
394 W Show objects that have pending writes
395 w Show objects that don't have pending writes
396 R Show objects that have outstanding reads
397 r Show objects that don't have outstanding reads
398 S Show objects that have slow work queued
399 s Show objects that don't have slow work queued
400
401If neither side of a letter pair is given, then both are implied. For example:
402
403 keyctl add user fscache:objlist KB @s
404
405shows objects that are busy, and lists their object keys, but does not dump
406their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is
407not implied.
408
409By default all objects and all fields will be shown.
410
411
302========= 412=========
303DEBUGGING 413DEBUGGING
304========= 414=========
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 2666b1ed5e9e..1902c57b72ef 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -641,7 +641,7 @@ data file must be retired (see the relinquish cookie function below).
641 641
642Furthermore, note that this does not cancel the asynchronous read or write 642Furthermore, note that this does not cancel the asynchronous read or write
643operation started by the read/alloc and write functions, so the page 643operation started by the read/alloc and write functions, so the page
644invalidation and release functions must use: 644invalidation functions must use:
645 645
646 bool fscache_check_page_write(struct fscache_cookie *cookie, 646 bool fscache_check_page_write(struct fscache_cookie *cookie,
647 struct page *page); 647 struct page *page);
@@ -654,6 +654,25 @@ to see if a page is being written to the cache, and:
654to wait for it to finish if it is. 654to wait for it to finish if it is.
655 655
656 656
657When releasepage() is being implemented, a special FS-Cache function exists to
658manage the heuristics of coping with vmscan trying to eject pages, which may
659conflict with the cache trying to write pages to the cache (which may itself
660need to allocate memory):
661
662 bool fscache_maybe_release_page(struct fscache_cookie *cookie,
663 struct page *page,
664 gfp_t gfp);
665
666This takes the netfs cookie, and the page and gfp arguments as supplied to
667releasepage(). It will return false if the page cannot be released yet for
668some reason and if it returns true, the page has been uncached and can now be
669released.
670
671To make a page available for release, this function may wait for an outstanding
672storage request to complete, or it may attempt to cancel the storage request -
673in which case the page will not be stored in the cache this time.
674
675
657========================== 676==========================
658INDEX AND DATA FILE UPDATE 677INDEX AND DATA FILE UPDATE
659========================== 678==========================
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt
index 4c0c575a4012..79334ed5daa7 100644
--- a/Documentation/filesystems/dentry-locking.txt
+++ b/Documentation/filesystems/dentry-locking.txt
@@ -62,7 +62,8 @@ changes are :
622. Insertion of a dentry into the hash table is done using 622. Insertion of a dentry into the hash table is done using
63 hlist_add_head_rcu() which take care of ordering the writes - the 63 hlist_add_head_rcu() which take care of ordering the writes - the
64 writes to the dentry must be visible before the dentry is 64 writes to the dentry must be visible before the dentry is
65 inserted. This works in conjunction with hlist_for_each_rcu() while 65 inserted. This works in conjunction with hlist_for_each_rcu(),
66 which has since been replaced by hlist_for_each_entry_rcu(), while
66 walking the hash chain. The only requirement is that all 67 walking the hash chain. The only requirement is that all
67 initialization to the dentry must be done before 68 initialization to the dentry must be done before
68 hlist_add_head_rcu() since we don't have dcache_lock protection 69 hlist_add_head_rcu() since we don't have dcache_lock protection
diff --git a/Documentation/filesystems/dnotify.txt b/Documentation/filesystems/dnotify.txt
index 9f5d338ddbb8..6baf88f46859 100644
--- a/Documentation/filesystems/dnotify.txt
+++ b/Documentation/filesystems/dnotify.txt
@@ -62,38 +62,9 @@ disabled, fcntl(fd, F_NOTIFY, ...) will return -EINVAL.
62 62
63Example 63Example
64------- 64-------
65See Documentation/filesystems/dnotify_test.c for an example.
65 66
66 #define _GNU_SOURCE /* needed to get the defines */ 67NOTE
67 #include <fcntl.h> /* in glibc 2.2 this has the needed 68----
68 values defined */ 69Beginning with Linux 2.6.13, dnotify has been replaced by inotify.
69 #include <signal.h> 70See Documentation/filesystems/inotify.txt for more information on it.
70 #include <stdio.h>
71 #include <unistd.h>
72
73 static volatile int event_fd;
74
75 static void handler(int sig, siginfo_t *si, void *data)
76 {
77 event_fd = si->si_fd;
78 }
79
80 int main(void)
81 {
82 struct sigaction act;
83 int fd;
84
85 act.sa_sigaction = handler;
86 sigemptyset(&act.sa_mask);
87 act.sa_flags = SA_SIGINFO;
88 sigaction(SIGRTMIN + 1, &act, NULL);
89
90 fd = open(".", O_RDONLY);
91 fcntl(fd, F_SETSIG, SIGRTMIN + 1);
92 fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
93 /* we will now be notified if any of the files
94 in "." is modified or new files are created */
95 while (1) {
96 pause();
97 printf("Got event on fd=%d\n", event_fd);
98 }
99 }
diff --git a/Documentation/filesystems/dnotify_test.c b/Documentation/filesystems/dnotify_test.c
new file mode 100644
index 000000000000..8b37b4a1e18d
--- /dev/null
+++ b/Documentation/filesystems/dnotify_test.c
@@ -0,0 +1,34 @@
1#define _GNU_SOURCE /* needed to get the defines */
2#include <fcntl.h> /* in glibc 2.2 this has the needed
3 values defined */
4#include <signal.h>
5#include <stdio.h>
6#include <unistd.h>
7
8static volatile int event_fd;
9
10static void handler(int sig, siginfo_t *si, void *data)
11{
12 event_fd = si->si_fd;
13}
14
15int main(void)
16{
17 struct sigaction act;
18 int fd;
19
20 act.sa_sigaction = handler;
21 sigemptyset(&act.sa_mask);
22 act.sa_flags = SA_SIGINFO;
23 sigaction(SIGRTMIN + 1, &act, NULL);
24
25 fd = open(".", O_RDONLY);
26 fcntl(fd, F_SETSIG, SIGRTMIN + 1);
27 fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT);
28 /* we will now be notified if any of the files
29 in "." is modified or new files are created */
30 while (1) {
31 pause();
32 printf("Got event on fd=%d\n", event_fd);
33 }
34}
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
index 0ced74c2f73c..abd2a9b5b787 100644
--- a/Documentation/filesystems/exofs.txt
+++ b/Documentation/filesystems/exofs.txt
@@ -60,13 +60,13 @@ USAGE
60 60
61 mkfs.exofs --pid=65536 --format /dev/osd0 61 mkfs.exofs --pid=65536 --format /dev/osd0
62 62
63 The --format is optional if not specified no OSD_FORMAT will be 63 The --format is optional. If not specified, no OSD_FORMAT will be
64 preformed and a clean file system will be created in the specified pid, 64 performed and a clean file system will be created in the specified pid,
65 in the available space of the target. (Use --format=size_in_meg to limit 65 in the available space of the target. (Use --format=size_in_meg to limit
66 the total LUN space available) 66 the total LUN space available)
67 67
68 If pid already exist it will be deleted and a new one will be created in it's 68 If pid already exists, it will be deleted and a new one will be created in
69 place. Be careful. 69 its place. Be careful.
70 70
71 An exofs lives inside a single OSD partition. You can create multiple exofs 71 An exofs lives inside a single OSD partition. You can create multiple exofs
72 filesystems on the same device using multiple pids. 72 filesystems on the same device using multiple pids.
@@ -81,7 +81,7 @@ USAGE
81 81
827. For reference (See do-exofs example script): 827. For reference (See do-exofs example script):
83 do-exofs start - an example of how to perform the above steps. 83 do-exofs start - an example of how to perform the above steps.
84 do-exofs stop - an example of how to unmount the file system. 84 do-exofs stop - an example of how to unmount the file system.
85 do-exofs format - an example of how to format and mkfs a new exofs. 85 do-exofs format - an example of how to format and mkfs a new exofs.
86 86
878. Extra compilation flags (uncomment in fs/exofs/Kbuild): 878. Extra compilation flags (uncomment in fs/exofs/Kbuild):
@@ -104,8 +104,8 @@ Where:
104 exofs specific options: Options are separated by commas (,) 104 exofs specific options: Options are separated by commas (,)
105 pid=<integer> - The partition number to mount/create as 105 pid=<integer> - The partition number to mount/create as
106 container of the filesystem. 106 container of the filesystem.
107 This option is mandatory 107 This option is mandatory.
108 to=<integer> - Timeout in ticks for a single command 108 to=<integer> - Timeout in ticks for a single command.
109 default is (60 * HZ) [for debugging only] 109 default is (60 * HZ) [for debugging only]
110 110
111=============================================================================== 111===============================================================================
@@ -116,7 +116,7 @@ DESIGN
116 with a special ID (defined in common.h). 116 with a special ID (defined in common.h).
117 Information included in the file system control block is used to fill the 117 Information included in the file system control block is used to fill the
118 in-memory superblock structure at mount time. This object is created before 118 in-memory superblock structure at mount time. This object is created before
119 the file system is used by mkexofs.c It contains information such as: 119 the file system is used by mkexofs.c. It contains information such as:
120 - The file system's magic number 120 - The file system's magic number
121 - The next inode number to be allocated 121 - The next inode number to be allocated
122 122
@@ -134,8 +134,8 @@ DESIGN
134 attributes. This applies to both regular files and other types (directories, 134 attributes. This applies to both regular files and other types (directories,
135 device files, symlinks, etc.). 135 device files, symlinks, etc.).
136 136
137* Credentials are generated per object (inode and superblock) when they is 137* Credentials are generated per object (inode and superblock) when they are
138 created in memory (read off disk or created). The credential works for all 138 created in memory (read from disk or created). The credential works for all
139 operations and is used as long as the object remains in memory. 139 operations and is used as long as the object remains in memory.
140 140
141* Async OSD operations are used whenever possible, but the target may execute 141* Async OSD operations are used whenever possible, but the target may execute
@@ -145,7 +145,8 @@ DESIGN
145 from executing in reverse order: 145 from executing in reverse order:
146 - The following are handled with the OBJ_CREATED and OBJ_2BCREATED 146 - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
147 flags. OBJ_CREATED is set when we know the object exists on the OSD - 147 flags. OBJ_CREATED is set when we know the object exists on the OSD -
148 in create's callback function, and when we successfully do a read_inode. 148 in create's callback function, and when we successfully do a
149 read_inode.
149 OBJ_2BCREATED is set in the beginning of the create function, so we 150 OBJ_2BCREATED is set in the beginning of the create function, so we
150 know that we should wait. 151 know that we should wait.
151 - create/delete: delete should wait until the object is created 152 - create/delete: delete should wait until the object is created
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 570f9bd9be2b..867c5b50cb42 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -32,8 +32,8 @@ journal_dev=devnum When the external journal device's major/minor numbers
32 identified through its new major/minor numbers encoded 32 identified through its new major/minor numbers encoded
33 in devnum. 33 in devnum.
34 34
35noload Don't load the journal on mounting. Note that this forces 35norecovery Don't load the journal on mounting. Note that this forces
36 mount of inconsistent filesystem, which can lead to 36noload mount of inconsistent filesystem, which can lead to
37 various problems. 37 various problems.
38 38
39data=journal All data are committed into the journal prior to being 39data=journal All data are committed into the journal prior to being
@@ -123,10 +123,18 @@ resuid=n The user ID which may use the reserved blocks.
123 123
124sb=n Use alternate superblock at this location. 124sb=n Use alternate superblock at this location.
125 125
126quota 126quota These options are ignored by the filesystem. They
127noquota 127noquota are used only by quota tools to recognize volumes
128grpquota 128grpquota where quota should be turned on. See documentation
129usrquota 129usrquota in the quota-tools package for more details
130 (http://sourceforge.net/projects/linuxquota).
131
132jqfmt=<quota type> These options tell the filesystem details about quota
133usrjquota=<file> so that quota information can be properly updated
134grpjquota=<file> during journal replay. They replace the above
135 quota options. See documentation in the quota-tools
136 package for more details
137 (http://sourceforge.net/projects/linuxquota).
130 138
131bh (*) ext3 associates buffer heads to data pages to 139bh (*) ext3 associates buffer heads to data pages to
132nobh (a) cache disk block mapping information 140nobh (a) cache disk block mapping information
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 18b5ec8cea45..e1def1786e50 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -134,9 +134,15 @@ ro Mount filesystem read only. Note that ext4 will
134 mount options "ro,noload" can be used to prevent 134 mount options "ro,noload" can be used to prevent
135 writes to the filesystem. 135 writes to the filesystem.
136 136
137journal_checksum Enable checksumming of the journal transactions.
138 This will allow the recovery code in e2fsck and the
139 kernel to detect corruption in the journal. It is a
140 compatible change and will be ignored by older kernels.
141
137journal_async_commit Commit block can be written to disk without waiting 142journal_async_commit Commit block can be written to disk without waiting
138 for descriptor blocks. If enabled older kernels cannot 143 for descriptor blocks. If enabled older kernels cannot
139 mount the device. 144 mount the device. This will enable 'journal_checksum'
145 internally.
140 146
141journal=update Update the ext4 file system's journal to the current 147journal=update Update the ext4 file system's journal to the current
142 format. 148 format.
@@ -147,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers
147 identified through its new major/minor numbers encoded 153 identified through its new major/minor numbers encoded
148 in devnum. 154 in devnum.
149 155
150noload Don't load the journal on mounting. Note that 156norecovery Don't load the journal on mounting. Note that
151 if the filesystem was not unmounted cleanly, 157noload if the filesystem was not unmounted cleanly,
152 skipping the journal replay will lead to the 158 skipping the journal replay will lead to the
153 filesystem containing inconsistencies that can 159 filesystem containing inconsistencies that can
154 lead to any number of problems. 160 lead to any number of problems.
@@ -190,7 +196,7 @@ nobarrier This also requires an IO stack which can support
190 also be used to enable or disable barriers, for 196 also be used to enable or disable barriers, for
191 consistency with other ext4 mount options. 197 consistency with other ext4 mount options.
192 198
193inode_readahead=n This tuning parameter controls the maximum 199inode_readahead_blks=n This tuning parameter controls the maximum
194 number of inode table blocks that ext4's inode 200 number of inode table blocks that ext4's inode
195 table readahead algorithm will pre-read into 201 table readahead algorithm will pre-read into
196 the buffer cache. The default value is 32 blocks. 202 the buffer cache. The default value is 32 blocks.
@@ -282,9 +288,16 @@ stripe=n Number of filesystem blocks that mballoc will try
282 to use for allocation size and alignment. For RAID5/6 288 to use for allocation size and alignment. For RAID5/6
283 systems this should be the number of data 289 systems this should be the number of data
284 disks * RAID chunk size in file system blocks. 290 disks * RAID chunk size in file system blocks.
285delalloc (*) Deferring block allocation until write-out time. 291
286nodelalloc Disable delayed allocation. Blocks are allocation 292delalloc (*) Defer block allocation until just before ext4
287 when data is copied from user to page cache. 293 writes out the block(s) in question. This
294 allows ext4 to make better allocation decisions
295 more efficiently.
296nodelalloc Disable delayed allocation. Blocks are allocated
297 when the data is copied from userspace to the
298 page cache, either via the write(2) system call
299 or when an mmap'ed page which was previously
300 unallocated is written for the first time.
288 301
289max_batch_time=usec Maximum amount of time ext4 should wait for 302max_batch_time=usec Maximum amount of time ext4 should wait for
290 additional filesystem operations to be batch 303 additional filesystem operations to be batch
@@ -340,6 +353,12 @@ noauto_da_alloc replacing existing files via patterns such as
340 system crashes before the delayed allocation 353 system crashes before the delayed allocation
341 blocks are forced to disk. 354 blocks are forced to disk.
342 355
356discard Controls whether ext4 should issue discard/TRIM
357nodiscard(*) commands to the underlying block device when
358 blocks are freed. This is useful for SSD devices
359 and sparse/thinly-provisioned LUNs, but it is off
360 by default until sufficient testing has been done.
361
343Data Mode 362Data Mode
344========= 363=========
345There are 3 different data modes: 364There are 3 different data modes:
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
new file mode 100644
index 000000000000..e64c94ba401a
--- /dev/null
+++ b/Documentation/filesystems/logfs.txt
@@ -0,0 +1,241 @@
1
2The LogFS Flash Filesystem
3==========================
4
5Specification
6=============
7
8Superblocks
9-----------
10
11Two superblocks exist at the beginning and end of the filesystem.
12Each superblock is 256 Bytes large, with another 3840 Bytes reserved
13for future purposes, making a total of 4096 Bytes.
14
15Superblock locations may differ for MTD and block devices. On MTD the
16first non-bad block contains a superblock in the first 4096 Bytes and
17the last non-bad block contains a superblock in the last 4096 Bytes.
18On block devices, the first 4096 Bytes of the device contain the first
19superblock and the last aligned 4096 Byte-block contains the second
20superblock.
21
22For the most part, the superblocks can be considered read-only. They
23are written only to correct errors detected within the superblocks,
24move the journal and change the filesystem parameters through tunefs.
25As a result, the superblock does not contain any fields that require
26constant updates, like the amount of free space, etc.
27
28Segments
29--------
30
31The space in the device is split up into equal-sized segments.
32Segments are the primary write unit of LogFS. Within each segment,
33writes happen from front (low addresses) to back (high addresses). If
34only a partial segment has been written, the segment number, the
35current position within and optionally a write buffer are stored in
36the journal.
37
38Segments are erased as a whole. Therefore Garbage Collection may be
39required to completely free a segment before doing so.
40
41Journal
42--------
43
44The journal contains all global information about the filesystem that
45is subject to frequent change. At mount time, it has to be scanned
46for the most recent commit entry, which contains a list of pointers to
47all currently valid entries.
48
49Object Store
50------------
51
52All space except for the superblocks and journal is part of the object
53store. Each segment contains a segment header and a number of
54objects, each consisting of the object header and the payload.
55Objects are either inodes, directory entries (dentries), file data
56blocks or indirect blocks.
57
58Levels
59------
60
61Garbage collection (GC) may fail if all data is written
62indiscriminately. One requirement of GC is that data is separated
63roughly according to the distance between the tree root and the data.
64Effectively that means all file data is on level 0, indirect blocks
65are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
66respectively. Inode file data is on level 6 for the inodes and 7-11
67for indirect blocks.
68
69Each segment contains objects of a single level only. As a result,
70each level requires its own separate segment to be open for writing.
71
72Inode File
73----------
74
75All inodes are stored in a special file, the inode file. Single
76exception is the inode file's inode (master inode) which for obvious
77reasons is stored in the journal instead. Instead of data blocks, the
78leaf nodes of the inode files are inodes.
79
80Aliases
81-------
82
83Writes in LogFS are done by means of a wandering tree. A naïve
84implementation would require that for each write of a block, all
85parent blocks are written as well, since the block pointers have
86changed. Such an implementation would not be very efficient.
87
88In LogFS, the block pointer changes are cached in the journal by means
89of alias entries. Each alias consists of its logical address - inode
90number, block index, level and child number (index into block) - and
91the changed data. Any 8-byte word can be changed in this manner.
92
93Currently aliases are used for block pointers, file size, file used
94bytes and the height of an inode's indirect tree.
95
96Segment Aliases
97---------------
98
99Related to regular aliases, these are used to handle bad blocks.
100Initially, bad blocks are handled by moving the affected segment
101content to a spare segment and noting this move in the journal with a
102segment alias, a simple (to, from) tuple. GC will later empty this
103segment and the alias can be removed again. This is used on MTD only.
104
105Vim
106---
107
108By cleverly predicting the life time of data, it is possible to
109separate long-living data from short-living data and thereby reduce
110the GC overhead later. Each type of distinct life expectancy (vim) can
111have a separate segment open for writing. Each (level, vim) tuple can
112be open just once. If an open segment with unknown vim is encountered
113at mount time, it is closed and ignored henceforth.
114
115Indirect Tree
116-------------
117
118Inodes in LogFS are similar to FFS-style filesystems with direct and
119indirect block pointers. One difference is that LogFS uses a single
120indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
121A height field in the inode defines the height of the indirect tree
122and thereby the indirection of the pointer.
123
124Another difference is the addressing of indirect blocks. In LogFS,
125the first 16 pointers in the first indirect block are left empty,
126corresponding to the 16 direct pointers in the inode. In ext2 (maybe
127others as well) the first pointer in the first indirect block
128corresponds to logical block 12, skipping the 12 direct pointers.
129So where ext2 is using arithmetic to better utilize space, LogFS keeps
130arithmetic simple and uses compression to save space.
131
132Compression
133-----------
134
135Both file data and metadata can be compressed. Compression for file
136data can be enabled with chattr +c and disabled with chattr -c. Doing
137so has no effect on existing data, but new data will be stored
138accordingly. New inodes will inherit the compression flag of the
139parent directory.
140
141Metadata is always compressed. However, the space accounting ignores
142this and charges for the uncompressed size. Failing to do so could
143result in GC failures when, after moving some data, indirect blocks
144compress worse than previously. Even on a 100% full medium, GC may
145not consume any extra space, so the compression gains are lost space
146to the user.
147
148However, they are not lost space to the filesystem internals. By
149cheating the user for those bytes, the filesystem gained some slack
150space and GC will run less often and faster.
151
152Garbage Collection and Wear Leveling
153------------------------------------
154
155Garbage collection is invoked whenever the number of free segments
156falls below a threshold. The best (known) candidate is picked based
157on the least amount of valid data contained in the segment. All
158remaining valid data is copied elsewhere, thereby invalidating it.
159
160The GC code also checks for aliases and writes them back if their
161number gets too large.
162
163Wear leveling is done by occasionally picking a suboptimal segment for
164garbage collection. If a stale segment's erase count is significantly
165lower than the active segments' erase counts, it will be picked. Wear
166leveling is rate limited, so it will never monopolize the device for
167more than one segment worth at a time.
168
169Values for "occasionally", "significantly lower" are compile time
170constants.
171
172Hashed directories
173------------------
174
175To satisfy efficient lookup(), directory entries are hashed and
176located based on the hash. In order to both support large directories
177and not be overly inefficient for small directories, several hash
178tables of increasing size are used. For each table, the hash value
179modulo the table size gives the table index.
180
181Table sizes are chosen to limit the number of indirect blocks with a
182fully populated table to 0, 1, 2 or 3 respectively. So the first
183table contains 16 entries, the second 512-16, etc.
184
185The last table is special in several ways. First its size depends on
186the effective 32bit limit on telldir/seekdir cookies. Since logfs
187uses the upper half of the address space for indirect blocks, the size
188is limited to 2^31. Secondly the table contains hash buckets with 16
189entries each.
190
191Using single-entry buckets would result in birthday "attacks". At
192just 2^16 used entries, hash collisions would be likely (P >= 0.5).
193My math skills are insufficient to do the combinatorics for the 17x
194collisions necessary to overflow a bucket, but testing showed that in
19510,000 runs the lowest directory fill before a bucket overflow was
196188,057,130 entries with an average of 315,149,915 entries. So for
197directory sizes of up to a million, bucket overflows should be
198virtually impossible under normal circumstances.
199
200With carefully chosen filenames, it is obviously possible to cause an
201overflow with just 21 entries (4 higher tables + 16 entries + 1). So
202there may be a security concern if a malicious user has write access
203to a directory.
204
205Open For Discussion
206===================
207
208Device Address Space
209--------------------
210
211A device address space is used for caching. Both block devices and
212MTD provide functions to either read a single page or write a segment.
213Partial segments may be written for data integrity, but where possible
214complete segments are written for performance on simple block device
215flash media.
216
217Meta Inodes
218-----------
219
220Inodes are stored in the inode file, which is just a regular file for
221most purposes. At umount time, however, the inode file needs to
222remain open until all dirty inodes are written. So
223generic_shutdown_super() may not close this inode, but shouldn't
224complain about remaining inodes due to the inode file either. Same
225goes for the mapping inode of the device address space.
226
227Currently logfs uses a hack that essentially copies part of fs/inode.c
228code over. A general solution would be preferred.
229
230Indirect block mapping
231----------------------
232
233With compression, the block device (or mapping inode) cannot be used
234to cache indirect blocks. Some other place is required. Currently
235logfs uses the top half of each inode's address space. The low 8TB
236(on 32bit) are filled with file data, the high 8TB are used for
237indirect blocks.
238
239One problem is that 16TB files created on 64bit systems actually have
240data in the top 8TB. But files >16TB would cause problems anyway, so
241only the limit has changed.
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
new file mode 100644
index 000000000000..2f68cd688769
--- /dev/null
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -0,0 +1,16 @@
100-INDEX
2 - this file (nfs-related documentation).
3Exporting
4 - explanation of how to make filesystems exportable.
5knfsd-stats.txt
6 - statistics which the NFS server makes available to user space.
7nfs.txt
8 - nfs client, and DNS resolution for fs_locations.
9nfs41-server.txt
10 - info on the Linux server implementation of NFSv4 minor version 1.
11nfs-rdma.txt
12 - how to install and setup the Linux NFS/RDMA client and server software
13nfsroot.txt
14 - short guide on setting up a diskless box with NFS root filesystem.
15rpc-cache.txt
16 - introduction to the caching mechanisms in the sunrpc layer.
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/nfs/Exporting
index 87019d2b5981..87019d2b5981 100644
--- a/Documentation/filesystems/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/nfs/knfsd-stats.txt
index 64ced5149d37..64ced5149d37 100644
--- a/Documentation/filesystems/knfsd-stats.txt
+++ b/Documentation/filesystems/nfs/knfsd-stats.txt
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt
index e386f7e4bcee..e386f7e4bcee 100644
--- a/Documentation/filesystems/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs/nfs.txt
index f50f26ce6cd0..f50f26ce6cd0 100644
--- a/Documentation/filesystems/nfs.txt
+++ b/Documentation/filesystems/nfs/nfs.txt
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 5920fe26e6ff..6a53a84afc72 100644
--- a/Documentation/filesystems/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4
17on or off; rpc.nfsd does this correctly.) 17on or off; rpc.nfsd does this correctly.)
18 18
19The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based 19The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
20on the latest NFSv4.1 Internet Draft: 20on RFC 5661.
21http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
22 21
23From the many new features in NFSv4.1 the current implementation 22From the many new features in NFSv4.1 the current implementation
24focuses on the mandatory-to-implement NFSv4.1 Sessions, providing 23focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
@@ -41,10 +40,10 @@ interoperability problems with future clients. Known issues:
41 conformant with the spec (for example, we don't use kerberos 40 conformant with the spec (for example, we don't use kerberos
42 on the backchannel correctly). 41 on the backchannel correctly).
43 - no trunking support: no clients currently take advantage of 42 - no trunking support: no clients currently take advantage of
44 trunking, but this is a mandatory failure, and its use is 43 trunking, but this is a mandatory feature, and its use is
45 recommended to clients in a number of places. (E.g. to ensure 44 recommended to clients in a number of places. (E.g. to ensure
46 timely renewal in case an existing connection's retry timeouts 45 timely renewal in case an existing connection's retry timeouts
47 have gotten too long; see section 8.3 of the draft.) 46 have gotten too long; see section 8.3 of the RFC.)
48 Therefore, lack of this feature may cause future clients to 47 Therefore, lack of this feature may cause future clients to
49 fail. 48 fail.
50 - Incomplete backchannel support: incomplete backchannel gss 49 - Incomplete backchannel support: incomplete backchannel gss
@@ -213,3 +212,10 @@ The following cases aren't supported yet:
213 DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID. 212 DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID.
214* DESTROY_SESSION MUST be the final operation in the COMPOUND request. 213* DESTROY_SESSION MUST be the final operation in the COMPOUND request.
215 214
215Nonstandard compound limitations:
216* No support for a sessions fore channel RPC compound that requires both a
217 ca_maxrequestsize request and a ca_maxresponsesize reply, so we may
218 fail to live up to the promise we made in CREATE_SESSION fore channel
219 negotiation.
220* No more than one IO operation (read, write, readdir) allowed per
221 compound.
diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index 3ba0b945aaf8..3ba0b945aaf8 100644
--- a/Documentation/filesystems/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
diff --git a/Documentation/filesystems/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt
index 8a382bea6808..8a382bea6808 100644
--- a/Documentation/filesystems/rpc-cache.txt
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 01539f410676..cf6d0d85ca82 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -28,7 +28,7 @@ described in the man pages included in the package.
28Project web page: http://www.nilfs.org/en/ 28Project web page: http://www.nilfs.org/en/
29Download page: http://www.nilfs.org/en/download.html 29Download page: http://www.nilfs.org/en/download.html
30Git tree web page: http://www.nilfs.org/git/ 30Git tree web page: http://www.nilfs.org/git/
31NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users 31List info: http://vger.kernel.org/vger-lists.html#linux-nilfs
32 32
33Caveats 33Caveats
34======= 34=======
@@ -49,8 +49,7 @@ Mount options
49NILFS2 supports the following mount options: 49NILFS2 supports the following mount options:
50(*) == default 50(*) == default
51 51
52barrier=on(*) This enables/disables barriers. barrier=off disables 52nobarrier Disables barriers.
53 it, barrier=on enables it.
54errors=continue(*) Keep going on a filesystem error. 53errors=continue(*) Keep going on a filesystem error.
55errors=remount-ro Remount the filesystem read-only on an error. 54errors=remount-ro Remount the filesystem read-only on an error.
56errors=panic Panic and halt the machine if an error occurs. 55errors=panic Panic and halt the machine if an error occurs.
@@ -71,6 +70,13 @@ order=strict Apply strict in-order semantics that preserves sequence
71 blocks. That means, it is guaranteed that no 70 blocks. That means, it is guaranteed that no
72 overtaking of events occurs in the recovered file 71 overtaking of events occurs in the recovered file
73 system after a crash. 72 system after a crash.
73norecovery Disable recovery of the filesystem on mount.
74 This disables every write access on the device for
75 read-only mounts or snapshots. This option will fail
76 for r/w mounts on an unclean volume.
77discard Issue discard/TRIM commands to the underlying block
78 device when blocks are freed. This is useful for SSD
79 devices and sparse/thinly-provisioned LUNs.
74 80
75NILFS2 usage 81NILFS2 usage
76============ 82============
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index c2a0871280a0..c58b9f5ba002 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -20,15 +20,16 @@ Lots of code taken from ext3 and other projects.
20Authors in alphabetical order: 20Authors in alphabetical order:
21Joel Becker <joel.becker@oracle.com> 21Joel Becker <joel.becker@oracle.com>
22Zach Brown <zach.brown@oracle.com> 22Zach Brown <zach.brown@oracle.com>
23Mark Fasheh <mark.fasheh@oracle.com> 23Mark Fasheh <mfasheh@suse.com>
24Kurt Hackel <kurt.hackel@oracle.com> 24Kurt Hackel <kurt.hackel@oracle.com>
25Tao Ma <tao.ma@oracle.com>
25Sunil Mushran <sunil.mushran@oracle.com> 26Sunil Mushran <sunil.mushran@oracle.com>
26Manish Singh <manish.singh@oracle.com> 27Manish Singh <manish.singh@oracle.com>
28Tiger Yang <tiger.yang@oracle.com>
27 29
28Caveats 30Caveats
29======= 31=======
30Features which OCFS2 does not support yet: 32Features which OCFS2 does not support yet:
31 - quotas
32 - Directory change notification (F_NOTIFY) 33 - Directory change notification (F_NOTIFY)
33 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) 34 - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
34 35
@@ -70,7 +71,6 @@ commit=nrsec (*) Ocfs2 can be told to sync all its data and metadata
70 performance. 71 performance.
71localalloc=8(*) Allows custom localalloc size in MB. If the value is too 72localalloc=8(*) Allows custom localalloc size in MB. If the value is too
72 large, the fs will silently revert it to the default. 73 large, the fs will silently revert it to the default.
73 Localalloc is not enabled for local mounts.
74localflocks This disables cluster aware flock. 74localflocks This disables cluster aware flock.
75inode64 Indicates that Ocfs2 is allowed to create inodes at 75inode64 Indicates that Ocfs2 is allowed to create inodes at
76 any location in the filesystem, including those which 76 any location in the filesystem, including those which
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 92b888d540a6..a7e9746ee7ea 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -140,7 +140,7 @@ Callers of notify_change() need ->i_mutex now.
140New super_block field "struct export_operations *s_export_op" for 140New super_block field "struct export_operations *s_export_op" for
141explicit support for exporting, e.g. via NFS. The structure is fully 141explicit support for exporting, e.g. via NFS. The structure is fully
142documented at its declaration in include/linux/fs.h, and in 142documented at its declaration in include/linux/fs.h, and in
143Documentation/filesystems/Exporting. 143Documentation/filesystems/nfs/Exporting.
144 144
145Briefly it allows for the definition of decode_fh and encode_fh operations 145Briefly it allows for the definition of decode_fh and encode_fh operations
146to encode and decode filehandles, and allows the filesystem to use 146to encode and decode filehandles, and allows the filesystem to use
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index b5aee7838a00..a4f30faa4f1f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -38,6 +38,7 @@ Table of Contents
38 3.3 /proc/<pid>/io - Display the IO accounting fields 38 3.3 /proc/<pid>/io - Display the IO accounting fields
39 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings 39 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings
40 3.5 /proc/<pid>/mountinfo - Information about mounts 40 3.5 /proc/<pid>/mountinfo - Information about mounts
41 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
41 42
42 43
43------------------------------------------------------------------------------ 44------------------------------------------------------------------------------
@@ -163,6 +164,7 @@ read the file /proc/PID/status:
163 VmExe: 68 kB 164 VmExe: 68 kB
164 VmLib: 1412 kB 165 VmLib: 1412 kB
165 VmPTE: 20 kb 166 VmPTE: 20 kb
167 VmSwap: 0 kB
166 Threads: 1 168 Threads: 1
167 SigQ: 0/28578 169 SigQ: 0/28578
168 SigPnd: 0000000000000000 170 SigPnd: 0000000000000000
@@ -176,7 +178,6 @@ read the file /proc/PID/status:
176 CapBnd: ffffffffffffffff 178 CapBnd: ffffffffffffffff
177 voluntary_ctxt_switches: 0 179 voluntary_ctxt_switches: 0
178 nonvoluntary_ctxt_switches: 1 180 nonvoluntary_ctxt_switches: 1
179 Stack usage: 12 kB
180 181
181This shows you nearly the same information you would get if you viewed it with 182This shows you nearly the same information you would get if you viewed it with
182the ps command. In fact, ps uses the proc file system to obtain its 183the ps command. In fact, ps uses the proc file system to obtain its
@@ -188,7 +189,13 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
188contains details information about the process itself. Its fields are 189contains details information about the process itself. Its fields are
189explained in Table 1-4. 190explained in Table 1-4.
190 191
191Table 1-2: Contents of the statm files (as of 2.6.30-rc7) 192(for SMP CONFIG users)
193To make accounting scalable, RSS-related information is handled in an
194asynchronous manner and the value may not be very precise. To see a precise
195snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
196It's slow but very precise.
197
198Table 1-2: Contents of the status files (as of 2.6.30-rc7)
192.............................................................................. 199..............................................................................
193 Field Content 200 Field Content
194 Name filename of the executable 201 Name filename of the executable
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
213 VmExe size of text segment 220 VmExe size of text segment
214 VmLib size of shared library code 221 VmLib size of shared library code
215 VmPTE size of page table entries 222 VmPTE size of page table entries
223 VmSwap size of swap usage (the number of referred swapents)
216 Threads number of threads 224 Threads number of threads
217 SigQ number of signals queued/max. number for queue 225 SigQ number of signals queued/max. number for queue
218 SigPnd bitmap of pending signals for the thread 226 SigPnd bitmap of pending signals for the thread
@@ -230,7 +238,6 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
230 Mems_allowed_list Same as previous, but in "list format" 238 Mems_allowed_list Same as previous, but in "list format"
231 voluntary_ctxt_switches number of voluntary context switches 239 voluntary_ctxt_switches number of voluntary context switches
232 nonvoluntary_ctxt_switches number of non voluntary context switches 240 nonvoluntary_ctxt_switches number of non voluntary context switches
233 Stack usage: stack usage high water mark (round up to page size)
234.............................................................................. 241..............................................................................
235 242
236Table 1-3: Contents of the statm files (as of 2.6.8-rc3) 243Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
@@ -431,6 +438,7 @@ Table 1-5: Kernel info in /proc
431 modules List of loaded modules 438 modules List of loaded modules
432 mounts Mounted filesystems 439 mounts Mounted filesystems
433 net Networking info (see text) 440 net Networking info (see text)
441 pagetypeinfo Additional page allocator information (see text) (2.5)
434 partitions Table of partitions known to the system 442 partitions Table of partitions known to the system
435 pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, 443 pci Deprecated info of PCI bus (new way -> /proc/bus/pci/,
436 decoupled by lspci (2.4) 444 decoupled by lspci (2.4)
@@ -585,7 +593,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ...
585Node 0, zone Normal 1 0 0 1 101 8 ... 593Node 0, zone Normal 1 0 0 1 101 8 ...
586Node 0, zone HighMem 2 0 0 1 1 0 ... 594Node 0, zone HighMem 2 0 0 1 1 0 ...
587 595
588Memory fragmentation is a problem under some workloads, and buddyinfo is a 596External fragmentation is a problem under some workloads, and buddyinfo is a
589useful tool for helping diagnose these problems. Buddyinfo will give you a 597useful tool for helping diagnose these problems. Buddyinfo will give you a
590clue as to how big an area you can safely allocate, or why a previous 598clue as to how big an area you can safely allocate, or why a previous
591allocation failed. 599allocation failed.
@@ -595,6 +603,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
595ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE 603ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE
596available in ZONE_NORMAL, etc... 604available in ZONE_NORMAL, etc...
597 605
606More information relevant to external fragmentation can be found in
607pagetypeinfo.
608
609> cat /proc/pagetypeinfo
610Page block order: 9
611Pages per block: 512
612
613Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
614Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0
615Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0
616Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2
617Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0
618Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0
619Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9
620Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0
621Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452
622Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0
623Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0
624
625Number of blocks type Unmovable Reclaimable Movable Reserve Isolate
626Node 0, zone DMA 2 0 5 1 0
627Node 0, zone DMA32 41 6 967 2 0
628
629Fragmentation avoidance in the kernel works by grouping pages of different
630migrate types into the same contiguous regions of memory called page blocks.
631A page block is typically the size of the default hugepage size e.g. 2MB on
632X86-64. By keeping pages grouped based on their ability to move, the kernel
633can reclaim pages within a page block to satisfy a high-order allocation.
634
635The pagetypeinfo begins with information on the size of a page block. It
636then gives the same type of information as buddyinfo except broken down
637by migrate-type and finishes with details on how many page blocks of each
638type exist.
639
640If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
641from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
642make an estimate of the likely number of huge pages that can be allocated
643at a given point in time. All the "Movable" blocks should be allocatable
644unless memory has been mlock()'d. Some of the Reclaimable blocks should
645also be allocatable although a lot of filesystem metadata may have to be
646reclaimed to achieve this.
647
598.............................................................................. 648..............................................................................
599 649
600meminfo: 650meminfo:
@@ -1072,7 +1122,8 @@ second). The meanings of the columns are as follows, from left to right:
1072- irq: servicing interrupts 1122- irq: servicing interrupts
1073- softirq: servicing softirqs 1123- softirq: servicing softirqs
1074- steal: involuntary wait 1124- steal: involuntary wait
1075- guest: running a guest 1125- guest: running a normal guest
1126- guest_nice: running a niced guest
1076 1127
1077The "intr" line gives counts of interrupts serviced since boot time, for each 1128The "intr" line gives counts of interrupts serviced since boot time, for each
1078of the possible system interrupts. The first column is the total of all 1129of the possible system interrupts. The first column is the total of all
@@ -1088,8 +1139,8 @@ The "processes" line gives the number of processes and threads created, which
1088includes (but is not limited to) those created by calls to the fork() and 1139includes (but is not limited to) those created by calls to the fork() and
1089clone() system calls. 1140clone() system calls.
1090 1141
1091The "procs_running" line gives the number of processes currently running on 1142The "procs_running" line gives the total number of threads that are
1092CPUs. 1143running or ready to run (i.e., the total number of runnable threads).
1093 1144
1094The "procs_blocked" line gives the number of processes currently blocked, 1145The "procs_blocked" line gives the number of processes currently blocked,
1095waiting for I/O to complete. 1146waiting for I/O to complete.
@@ -1113,7 +1164,6 @@ Table 1-12: Files in /proc/fs/ext4/<devname>
1113.............................................................................. 1164..............................................................................
1114 File Content 1165 File Content
1115 mb_groups details of multiblock allocator buddy cache of free blocks 1166 mb_groups details of multiblock allocator buddy cache of free blocks
1116 mb_history multiblock allocation history
1117.............................................................................. 1167..............................................................................
1118 1168
1119 1169
@@ -1409,3 +1459,11 @@ For more information on mount propagation see:
1409 1459
1410 Documentation/filesystems/sharedsubtree.txt 1460 Documentation/filesystems/sharedsubtree.txt
1411 1461
1462
14633.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
1464--------------------------------------------------------
1465These files provide a method to access a task's comm value. It also allows for
1466a task to set its own or one of its thread siblings' comm value. The comm value
1467is limited in size compared to the cmdline value, so writing anything longer
1468than the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated
1469comm value.
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 0d15ebccf5b0..a1e2e0dda907 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -248,9 +248,7 @@ code, that is done in the initialization code in the usual way:
248 { 248 {
249 struct proc_dir_entry *entry; 249 struct proc_dir_entry *entry;
250 250
251 entry = create_proc_entry("sequence", 0, NULL); 251 proc_create("sequence", 0, NULL, &ct_file_ops);
252 if (entry)
253 entry->proc_fops = &ct_file_ops;
254 return 0; 252 return 0;
255 } 253 }
256 254
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 23a181074f94..fc0e39af43c3 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -837,6 +837,9 @@ replicas continue to be exactly same.
837 individual lists does not affect propagation or the way propagation 837 individual lists does not affect propagation or the way propagation
838 tree is modified by operations. 838 tree is modified by operations.
839 839
840 All vfsmounts in a peer group have the same ->mnt_master. If it is
841 non-NULL, they form a contiguous (ordered) segment of the slave list.
842
840 An example propagation tree looks as shown in the figure below. 843 An example propagation tree looks as shown in the figure below.
841 [ NOTE: Though it looks like a forest, if we consider all the shared 844 [ NOTE: Though it looks like a forest, if we consider all the shared
842 mounts as a conceptual entity called 'pnode', it becomes a tree] 845 mounts as a conceptual entity called 'pnode', it becomes a tree]
@@ -874,8 +877,19 @@ replicas continue to be exactly same.
874 877
875 NOTE: The propagation tree is orthogonal to the mount tree. 878 NOTE: The propagation tree is orthogonal to the mount tree.
876 879
8808B Locking:
881
882 ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected
883 by namespace_sem (exclusive for modifications, shared for reading).
884
885 Normally we have ->mnt_flags modifications serialized by vfsmount_lock.
886 There are two exceptions: do_add_mount() and clone_mnt().
887 The former modifies a vfsmount that has not been visible in any shared
888 data structures yet.
889 The latter holds namespace_sem and the only references to vfsmount
890 are in lists that can't be traversed without namespace_sem.
877 891
8788B Algorithm: 8928C Algorithm:
879 893
880 The crux of the implementation resides in rbind/move operation. 894 The crux of the implementation resides in rbind/move operation.
881 895
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index b245d524d568..931c806642c5 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -91,8 +91,8 @@ struct device_attribute {
91 const char *buf, size_t count); 91 const char *buf, size_t count);
92}; 92};
93 93
94int device_create_file(struct device *, struct device_attribute *); 94int device_create_file(struct device *, const struct device_attribute *);
95void device_remove_file(struct device *, struct device_attribute *); 95void device_remove_file(struct device *, const struct device_attribute *);
96 96
97It also defines this helper for defining device attributes: 97It also defines this helper for defining device attributes:
98 98
@@ -316,8 +316,8 @@ DEVICE_ATTR(_name, _mode, _show, _store);
316 316
317Creation/Removal: 317Creation/Removal:
318 318
319int device_create_file(struct device *device, struct device_attribute * attr); 319int device_create_file(struct device *dev, const struct device_attribute * attr);
320void device_remove_file(struct device * dev, struct device_attribute * attr); 320void device_remove_file(struct device *dev, const struct device_attribute * attr);
321 321
322 322
323- bus drivers (include/linux/device.h) 323- bus drivers (include/linux/device.h)
@@ -358,7 +358,7 @@ DRIVER_ATTR(_name, _mode, _show, _store)
358 358
359Creation/Removal: 359Creation/Removal:
360 360
361int driver_create_file(struct device_driver *, struct driver_attribute *); 361int driver_create_file(struct device_driver *, const struct driver_attribute *);
362void driver_remove_file(struct device_driver *, struct driver_attribute *); 362void driver_remove_file(struct device_driver *, const struct driver_attribute *);
363 363
364 364
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index b58b84b50fa2..eed520fd0c8e 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -102,7 +102,7 @@ shortname=lower|win95|winnt|mixed
102 winnt: emulate the Windows NT rule for display/create. 102 winnt: emulate the Windows NT rule for display/create.
103 mixed: emulate the Windows NT rule for display, 103 mixed: emulate the Windows NT rule for display,
104 emulate the Windows 95 rule for create. 104 emulate the Windows 95 rule for create.
105 Default setting is `lower'. 105 Default setting is `mixed'.
106 106
107tz=UTC -- Interpret timestamps as UTC rather than local time. 107tz=UTC -- Interpret timestamps as UTC rather than local time.
108 This option disables the conversion of timestamps 108 This option disables the conversion of timestamps
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 623f094c9d8d..3de2f32edd90 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -472,7 +472,7 @@ __sync_single_inode) to check if ->writepages has been successful in
472writing out the whole address_space. 472writing out the whole address_space.
473 473
474The Writeback tag is used by filemap*wait* and sync_page* functions, 474The Writeback tag is used by filemap*wait* and sync_page* functions,
475via wait_on_page_writeback_range, to wait for all writeback to 475via filemap_fdatawait_range, to wait for all writeback to
476complete. While waiting ->sync_page (if defined) will be called on 476complete. While waiting ->sync_page (if defined) will be called on
477each page that is found to require writeback. 477each page that is found to require writeback.
478 478