diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-07-31 12:43:41 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-07-31 12:43:41 -0400 |
commit | 85e9ca333d03fbd56b9e123c8456f0d98e20faad (patch) | |
tree | 7bb15ada5f536950efa23ad60ea9eea60380ca1c /Documentation/filesystems | |
parent | a300bec952127d9a15e666b391bb35c9aecb3002 (diff) | |
parent | 6e86841d05f371b5b9b86ce76c02aaee83352298 (diff) |
Merge branch 'linus' into timers/hpet
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/Locking | 7 | ||||
-rw-r--r-- | Documentation/filesystems/bfs.txt | 10 | ||||
-rw-r--r-- | Documentation/filesystems/configfs/configfs_example.c | 4 | ||||
-rw-r--r-- | Documentation/filesystems/ext4.txt | 125 | ||||
-rw-r--r-- | Documentation/filesystems/gfs2-glocks.txt | 114 | ||||
-rw-r--r-- | Documentation/filesystems/nfs-rdma.txt | 103 | ||||
-rw-r--r-- | Documentation/filesystems/omfs.txt | 106 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 77 | ||||
-rw-r--r-- | Documentation/filesystems/relay.txt | 10 | ||||
-rw-r--r-- | Documentation/filesystems/sysfs.txt | 6 | ||||
-rw-r--r-- | Documentation/filesystems/ubifs.txt | 164 | ||||
-rw-r--r-- | Documentation/filesystems/vfat.txt | 8 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 6 |
13 files changed, 623 insertions, 117 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 8b22d7d8b991..680fb566b928 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -510,6 +510,7 @@ prototypes: | |||
510 | void (*close)(struct vm_area_struct*); | 510 | void (*close)(struct vm_area_struct*); |
511 | int (*fault)(struct vm_area_struct*, struct vm_fault *); | 511 | int (*fault)(struct vm_area_struct*, struct vm_fault *); |
512 | int (*page_mkwrite)(struct vm_area_struct *, struct page *); | 512 | int (*page_mkwrite)(struct vm_area_struct *, struct page *); |
513 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); | ||
513 | 514 | ||
514 | locking rules: | 515 | locking rules: |
515 | BKL mmap_sem PageLocked(page) | 516 | BKL mmap_sem PageLocked(page) |
@@ -517,6 +518,7 @@ open: no yes | |||
517 | close: no yes | 518 | close: no yes |
518 | fault: no yes | 519 | fault: no yes |
519 | page_mkwrite: no yes no | 520 | page_mkwrite: no yes no |
521 | access: no yes | ||
520 | 522 | ||
521 | ->page_mkwrite() is called when a previously read-only page is | 523 | ->page_mkwrite() is called when a previously read-only page is |
522 | about to become writeable. The file system is responsible for | 524 | about to become writeable. The file system is responsible for |
@@ -525,6 +527,11 @@ taking to lock out truncate, the page range should be verified to be | |||
525 | within i_size. The page mapping should also be checked that it is not | 527 | within i_size. The page mapping should also be checked that it is not |
526 | NULL. | 528 | NULL. |
527 | 529 | ||
530 | ->access() is called when get_user_pages() fails in | ||
531 | access_process_vm(), typically used to debug a process through | ||
532 | /proc/pid/mem or ptrace. This function is needed only for | ||
533 | VM_IO | VM_PFNMAP VMAs. | ||
534 | |||
528 | ================================================================================ | 535 | ================================================================================ |
529 | Dubious stuff | 536 | Dubious stuff |
530 | 537 | ||
diff --git a/Documentation/filesystems/bfs.txt b/Documentation/filesystems/bfs.txt index ea825e178e79..78043d5a8fc3 100644 --- a/Documentation/filesystems/bfs.txt +++ b/Documentation/filesystems/bfs.txt | |||
@@ -26,11 +26,11 @@ You can simplify mounting by just typing: | |||
26 | 26 | ||
27 | this will allocate the first available loopback device (and load loop.o | 27 | this will allocate the first available loopback device (and load loop.o |
28 | kernel module if necessary) automatically. If the loopback driver is not | 28 | kernel module if necessary) automatically. If the loopback driver is not |
29 | loaded automatically, make sure that your kernel is compiled with kmod | 29 | loaded automatically, make sure that you have compiled the module and |
30 | support (CONFIG_KMOD) enabled. Beware that umount will not | 30 | that modprobe is functioning. Beware that umount will not deallocate |
31 | deallocate /dev/loopN device if /etc/mtab file on your system is a | 31 | /dev/loopN device if /etc/mtab file on your system is a symbolic link to |
32 | symbolic link to /proc/mounts. You will need to do it manually using | 32 | /proc/mounts. You will need to do it manually using "-d" switch of |
33 | "-d" switch of losetup(8). Read losetup(8) manpage for more info. | 33 | losetup(8). Read losetup(8) manpage for more info. |
34 | 34 | ||
35 | To create the BFS image under UnixWare you need to find out first which | 35 | To create the BFS image under UnixWare you need to find out first which |
36 | slice contains it. The command prtvtoc(1M) is your friend: | 36 | slice contains it. The command prtvtoc(1M) is your friend: |
diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c index 25151fd5c2c6..039648791701 100644 --- a/Documentation/filesystems/configfs/configfs_example.c +++ b/Documentation/filesystems/configfs/configfs_example.c | |||
@@ -279,7 +279,7 @@ static struct config_item *simple_children_make_item(struct config_group *group, | |||
279 | 279 | ||
280 | simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL); | 280 | simple_child = kzalloc(sizeof(struct simple_child), GFP_KERNEL); |
281 | if (!simple_child) | 281 | if (!simple_child) |
282 | return NULL; | 282 | return ERR_PTR(-ENOMEM); |
283 | 283 | ||
284 | 284 | ||
285 | config_item_init_type_name(&simple_child->item, name, | 285 | config_item_init_type_name(&simple_child->item, name, |
@@ -366,7 +366,7 @@ static struct config_group *group_children_make_group(struct config_group *group | |||
366 | simple_children = kzalloc(sizeof(struct simple_children), | 366 | simple_children = kzalloc(sizeof(struct simple_children), |
367 | GFP_KERNEL); | 367 | GFP_KERNEL); |
368 | if (!simple_children) | 368 | if (!simple_children) |
369 | return NULL; | 369 | return ERR_PTR(-ENOMEM); |
370 | 370 | ||
371 | 371 | ||
372 | config_group_init_type_name(&simple_children->group, name, | 372 | config_group_init_type_name(&simple_children->group, name, |
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 0c5086db8352..80e193d82e2e 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org | |||
13 | 1. Quick usage instructions: | 13 | 1. Quick usage instructions: |
14 | =========================== | 14 | =========================== |
15 | 15 | ||
16 | - Grab updated e2fsprogs from | 16 | - Compile and install the latest version of e2fsprogs (as of this |
17 | ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/ | 17 | writing version 1.41) from: |
18 | This is a patchset on top of e2fsprogs-1.39, which can be found at | 18 | |
19 | http://sourceforge.net/project/showfiles.php?group_id=2406 | ||
20 | |||
21 | or | ||
22 | |||
19 | ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ | 23 | ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ |
20 | 24 | ||
21 | - It's still mke2fs -j /dev/hda1 | 25 | or grab the latest git repository from: |
26 | |||
27 | git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git | ||
28 | |||
29 | - Create a new filesystem using the ext4dev filesystem type: | ||
30 | |||
31 | # mke2fs -t ext4dev /dev/hda1 | ||
32 | |||
33 | Or configure an existing ext3 filesystem to support extents and set | ||
34 | the test_fs flag to indicate that it's ok for an in-development | ||
35 | filesystem to touch this filesystem: | ||
22 | 36 | ||
23 | - mount /dev/hda1 /wherever -t ext4dev | 37 | # tune2fs -O extents -E test_fs /dev/hda1 |
24 | 38 | ||
25 | - To enable extents, | 39 | If the filesystem was created with 128 byte inodes, it can be |
40 | converted to use 256 byte for greater efficiency via: | ||
26 | 41 | ||
27 | mount /dev/hda1 /wherever -t ext4dev -o extents | 42 | # tune2fs -I 256 /dev/hda1 |
28 | 43 | ||
29 | - The filesystem is compatible with the ext3 driver until you add a file | 44 | (Note: we currently do not have tools to convert an ext4dev |
30 | which has extents (ie: `mount -o extents', then create a file). | 45 | filesystem back to ext3; so please do not try this on production |
46 | filesystems.) | ||
31 | 47 | ||
32 | NOTE: The "extents" mount flag is temporary. It will soon go away and | 48 | - Mounting: |
33 | extents will be enabled by the "-o extents" flag to mke2fs or tune2fs | 49 | |
50 | # mount -t ext4dev /dev/hda1 /wherever | ||
34 | 51 | ||
35 | - When comparing performance with other filesystems, remember that | 52 | - When comparing performance with other filesystems, remember that |
36 | ext3/4 by default offers higher data integrity guarantees than most. So | 53 | ext3/4 by default offers higher data integrity guarantees than most. |
37 | when comparing with a metadata-only journalling filesystem, use `mount -o | 54 | So when comparing with a metadata-only journalling filesystem, such |
38 | data=writeback'. And you might as well use `mount -o nobh' too along | 55 | as ext3, use `mount -o data=writeback'. And you might as well use |
39 | with it. Making the journal larger than the mke2fs default often helps | 56 | `mount -o nobh' too along with it. Making the journal larger than |
40 | performance with metadata-intensive workloads. | 57 | the mke2fs default often helps performance with metadata-intensive |
58 | workloads. | ||
41 | 59 | ||
42 | 2. Features | 60 | 2. Features |
43 | =========== | 61 | =========== |
44 | 62 | ||
45 | 2.1 Currently available | 63 | 2.1 Currently available |
46 | 64 | ||
47 | * ability to use filesystems > 16TB | 65 | * ability to use filesystems > 16TB (e2fsprogs support not available yet) |
48 | * extent format reduces metadata overhead (RAM, IO for access, transactions) | 66 | * extent format reduces metadata overhead (RAM, IO for access, transactions) |
49 | * extent format more robust in face of on-disk corruption due to magics, | 67 | * extent format more robust in face of on-disk corruption due to magics, |
50 | * internal redundancy in tree | 68 | * internal redundancy in tree |
51 | 69 | * improved file allocation (multi-block alloc) | |
52 | 2.1 Previously available, soon to be enabled by default by "mkefs.ext4": | 70 | * fix 32000 subdirectory limit |
53 | 71 | * nsec timestamps for mtime, atime, ctime, create time | |
54 | * dir_index and resize inode will be on by default | 72 | * inode version field on disk (NFSv4, Lustre) |
55 | * large inodes will be used by default for fast EAs, nsec timestamps, etc | 73 | * reduced e2fsck time via uninit_bg feature |
74 | * journal checksumming for robustness, performance | ||
75 | * persistent file preallocation (e.g for streaming media, databases) | ||
76 | * ability to pack bitmaps and inode tables into larger virtual groups via the | ||
77 | flex_bg feature | ||
78 | * large file support | ||
79 | * Inode allocation using large virtual block groups via flex_bg | ||
80 | * delayed allocation | ||
81 | * large block (up to pagesize) support | ||
82 | * efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force | ||
83 | the ordering) | ||
56 | 84 | ||
57 | 2.2 Candidate features for future inclusion | 85 | 2.2 Candidate features for future inclusion |
58 | 86 | ||
59 | There are several under discussion, whether they all make it in is | 87 | * Online defrag (patches available but not well tested) |
60 | partly a function of how much time everyone has to work on them: | 88 | * reduced mke2fs time via lazy itable initialization in conjuction with |
89 | the uninit_bg feature (capability to do this is available in e2fsprogs | ||
90 | but a kernel thread to do lazy zeroing of unused inode table blocks | ||
91 | after filesystem is first mounted is required for safety) | ||
61 | 92 | ||
62 | * improved file allocation (multi-block alloc, delayed alloc; basically done) | 93 | There are several others under discussion, whether they all make it in is |
63 | * fix 32000 subdirectory limit (patch exists, needs some e2fsck work) | 94 | partly a function of how much time everyone has to work on them. Features like |
64 | * nsec timestamps for mtime, atime, ctime, create time (patch exists, | 95 | metadata checksumming have been discussed and planned for a bit but no patches |
65 | needs some e2fsck work) | 96 | exist yet so I'm not sure they're in the near-term roadmap. |
66 | * inode version field on disk (NFSv4, Lustre; prototype exists) | ||
67 | * reduced mke2fs/e2fsck time via uninitialized groups (prototype exists) | ||
68 | * journal checksumming for robustness, performance (prototype exists) | ||
69 | * persistent file preallocation (e.g for streaming media, databases) | ||
70 | 97 | ||
71 | Features like metadata checksumming have been discussed and planned for | 98 | The big performance win will come with mballoc, delalloc and flex_bg |
72 | a bit but no patches exist yet so I'm not sure they're in the near-term | 99 | grouping of bitmaps and inode tables. Some test results available here: |
73 | roadmap. | ||
74 | 100 | ||
75 | The big performance win will come with mballoc and delalloc. CFS has | 101 | - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html |
76 | been using mballoc for a few years already with Lustre, and IBM + Bull | 102 | - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html |
77 | did a lot of benchmarking on it. The reason it isn't in the first set of | ||
78 | patches is partly a manageability issue, and partly because it doesn't | ||
79 | directly affect the on-disk format (outside of much better allocation) | ||
80 | so it isn't critical to get into the first round of changes. I believe | ||
81 | Alex is working on a new set of patches right now. | ||
82 | 103 | ||
83 | 3. Options | 104 | 3. Options |
84 | ========== | 105 | ========== |
@@ -222,9 +243,11 @@ stripe=n Number of filesystem blocks that mballoc will try | |||
222 | to use for allocation size and alignment. For RAID5/6 | 243 | to use for allocation size and alignment. For RAID5/6 |
223 | systems this should be the number of data | 244 | systems this should be the number of data |
224 | disks * RAID chunk size in file system blocks. | 245 | disks * RAID chunk size in file system blocks. |
225 | 246 | delalloc (*) Deferring block allocation until write-out time. | |
247 | nodelalloc Disable delayed allocation. Blocks are allocated | ||
248 | when data is copied from user to page cache. | ||
226 | Data Mode | 249 | Data Mode |
227 | --------- | 250 | ========= |
228 | There are 3 different data modes: | 251 | There are 3 different data modes: |
229 | 252 | ||
230 | * writeback mode | 253 | * writeback mode |
@@ -236,10 +259,10 @@ typically provide the best ext4 performance. | |||
236 | 259 | ||
237 | * ordered mode | 260 | * ordered mode |
238 | In data=ordered mode, ext4 only officially journals metadata, but it logically | 261 | In data=ordered mode, ext4 only officially journals metadata, but it logically |
239 | groups metadata and data blocks into a single unit called a transaction. When | 262 | groups metadata information related to data changes with the data blocks into a |
240 | it's time to write the new metadata out to disk, the associated data blocks | 263 | single unit called a transaction. When it's time to write the new metadata |
241 | are written first. In general, this mode performs slightly slower than | 264 | out to disk, the associated data blocks are written first. In general, |
242 | writeback but significantly faster than journal mode. | 265 | this mode performs slightly slower than writeback but significantly faster than journal mode. |
243 | 266 | ||
244 | * journal mode | 267 | * journal mode |
245 | data=journal mode provides full data and metadata journaling. All new data is | 268 | data=journal mode provides full data and metadata journaling. All new data is |
@@ -247,7 +270,8 @@ written to the journal first, and then to its final location. | |||
247 | In the event of a crash, the journal can be replayed, bringing both data and | 270 | In the event of a crash, the journal can be replayed, bringing both data and |
248 | metadata into a consistent state. This mode is the slowest except when data | 271 | metadata into a consistent state. This mode is the slowest except when data |
249 | needs to be read from and written to disk at the same time where it | 272 | needs to be read from and written to disk at the same time where it |
250 | outperforms all other modes. | 273 | outperforms all other modes. Currently ext4 does not have delayed |
274 | allocation support if this data journalling mode is selected. | ||
251 | 275 | ||
252 | References | 276 | References |
253 | ========== | 277 | ========== |
@@ -256,7 +280,8 @@ kernel source: <file:fs/ext4/> | |||
256 | <file:fs/jbd2/> | 280 | <file:fs/jbd2/> |
257 | 281 | ||
258 | programs: http://e2fsprogs.sourceforge.net/ | 282 | programs: http://e2fsprogs.sourceforge.net/ |
259 | http://ext2resize.sourceforge.net | ||
260 | 283 | ||
261 | useful links: http://fedoraproject.org/wiki/ext3-devel | 284 | useful links: http://fedoraproject.org/wiki/ext3-devel |
262 | http://www.bullopensource.org/ext4/ | 285 | http://www.bullopensource.org/ext4/ |
286 | http://ext4.wiki.kernel.org/index.php/Main_Page | ||
287 | http://fedoraproject.org/wiki/Features/Ext4 | ||
diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt new file mode 100644 index 000000000000..4dae9a3840bf --- /dev/null +++ b/Documentation/filesystems/gfs2-glocks.txt | |||
@@ -0,0 +1,114 @@ | |||
1 | Glock internal locking rules | ||
2 | ------------------------------ | ||
3 | |||
4 | This documents the basic principles of the glock state machine | ||
5 | internals. Each glock (struct gfs2_glock in fs/gfs2/incore.h) | ||
6 | has two main (internal) locks: | ||
7 | |||
8 | 1. A spinlock (gl_spin) which protects the internal state such | ||
9 | as gl_state, gl_target and the list of holders (gl_holders) | ||
10 | 2. A non-blocking bit lock, GLF_LOCK, which is used to prevent other | ||
11 | threads from making calls to the DLM, etc. at the same time. If a | ||
12 | thread takes this lock, it must then call run_queue (usually via the | ||
13 | workqueue) when it releases it in order to ensure any pending tasks | ||
14 | are completed. | ||
15 | |||
16 | The gl_holders list contains all the queued lock requests (not | ||
17 | just the holders) associated with the glock. If there are any | ||
18 | held locks, then they will be contiguous entries at the head | ||
19 | of the list. Locks are granted in strictly the order that they | ||
20 | are queued, except for those marked LM_FLAG_PRIORITY which are | ||
21 | used only during recovery, and even then only for journal locks. | ||
22 | |||
23 | There are three lock states that users of the glock layer can request, | ||
24 | namely shared (SH), deferred (DF) and exclusive (EX). Those translate | ||
25 | to the following DLM lock modes: | ||
26 | |||
27 | Glock mode | DLM lock mode | ||
28 | ------------------------------ | ||
29 | UN | IV/NL Unlocked (no DLM lock associated with glock) or NL | ||
30 | SH | PR (Protected read) | ||
31 | DF | CW (Concurrent write) | ||
32 | EX | EX (Exclusive) | ||
33 | |||
34 | Thus DF is basically a shared mode which is incompatible with the "normal" | ||
35 | shared lock mode, SH. In GFS2 the DF mode is used exclusively for direct I/O | ||
36 | operations. The glocks are basically a lock plus some routines which deal | ||
37 | with cache management. The following rules apply for the cache: | ||
38 | |||
39 | Glock mode | Cache data | Cache Metadata | Dirty Data | Dirty Metadata | ||
40 | -------------------------------------------------------------------------- | ||
41 | UN | No | No | No | No | ||
42 | SH | Yes | Yes | No | No | ||
43 | DF | No | Yes | No | No | ||
44 | EX | Yes | Yes | Yes | Yes | ||
45 | |||
46 | These rules are implemented using the various glock operations which | ||
47 | are defined for each type of glock. Not all types of glocks use | ||
48 | all the modes. Only inode glocks use the DF mode for example. | ||
49 | |||
50 | Table of glock operations and per type constants: | ||
51 | |||
52 | Field | Purpose | ||
53 | ---------------------------------------------------------------------------- | ||
54 | go_xmote_th | Called before remote state change (e.g. to sync dirty data) | ||
55 | go_xmote_bh | Called after remote state change (e.g. to refill cache) | ||
56 | go_inval | Called if remote state change requires invalidating the cache | ||
57 | go_demote_ok | Returns boolean value of whether it's ok to demote a glock | ||
58 | | (e.g. checks timeout, and that there is no cached data) | ||
59 | go_lock | Called for the first local holder of a lock | ||
60 | go_unlock | Called on the final local unlock of a lock | ||
61 | go_dump | Called to print content of object for debugfs file, or on | ||
62 | | error to dump glock to the log. | ||
63 | go_type | The type of the glock, LM_TYPE_..... | ||
64 | go_min_hold_time | The minimum hold time | ||
65 | |||
66 | The minimum hold time for each lock is the time after a remote lock | ||
67 | grant for which we ignore remote demote requests. This is in order to | ||
68 | prevent a situation where locks are being bounced around the cluster | ||
69 | from node to node with none of the nodes making any progress. This | ||
70 | tends to show up most with shared mmaped files which are being written | ||
71 | to by multiple nodes. By delaying the demotion in response to a | ||
72 | remote callback, that gives the userspace program time to make | ||
73 | some progress before the pages are unmapped. | ||
74 | |||
75 | There is a plan to try and remove the go_lock and go_unlock callbacks | ||
76 | if possible, in order to try and speed up the fast path through the locking. | ||
77 | Also, eventually we hope to make the glock "EX" mode locally shared | ||
78 | such that any local locking will be done with the i_mutex as required | ||
79 | rather than via the glock. | ||
80 | |||
81 | Locking rules for glock operations: | ||
82 | |||
83 | Operation | GLF_LOCK bit lock held | gl_spin spinlock held | ||
84 | ----------------------------------------------------------------- | ||
85 | go_xmote_th | Yes | No | ||
86 | go_xmote_bh | Yes | No | ||
87 | go_inval | Yes | No | ||
88 | go_demote_ok | Sometimes | Yes | ||
89 | go_lock | Yes | No | ||
90 | go_unlock | Yes | No | ||
91 | go_dump | Sometimes | Yes | ||
92 | |||
93 | N.B. Operations must not drop either the bit lock or the spinlock | ||
94 | if it's held on entry. go_dump and go_demote_ok must never block. | ||
95 | Note that go_dump will only be called if the glock's state | ||
96 | indicates that it is caching uptodate data. | ||
97 | |||
98 | Glock locking order within GFS2: | ||
99 | |||
100 | 1. i_mutex (if required) | ||
101 | 2. Rename glock (for rename only) | ||
102 | 3. Inode glock(s) | ||
103 | (Parents before children, inodes at "same level" with same parent in | ||
104 | lock number order) | ||
105 | 4. Rgrp glock(s) (for (de)allocation operations) | ||
106 | 5. Transaction glock (via gfs2_trans_begin) for non-read operations | ||
107 | 6. Page lock (always last, very important!) | ||
108 | |||
109 | There are two glocks per inode. One deals with access to the inode | ||
110 | itself (locking order as above), and the other, known as the iopen | ||
111 | glock is used in conjunction with the i_nlink field in the inode to | ||
112 | determine the lifetime of the inode in question. Locking of inodes | ||
113 | is on a per-inode basis. Locking of rgrps is on a per rgrp basis. | ||
114 | |||
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs-rdma.txt index d0ec45ae4e7d..44bd766f2e5d 100644 --- a/Documentation/filesystems/nfs-rdma.txt +++ b/Documentation/filesystems/nfs-rdma.txt | |||
@@ -5,7 +5,7 @@ | |||
5 | ################################################################################ | 5 | ################################################################################ |
6 | 6 | ||
7 | Author: NetApp and Open Grid Computing | 7 | Author: NetApp and Open Grid Computing |
8 | Date: April 15, 2008 | 8 | Date: May 29, 2008 |
9 | 9 | ||
10 | Table of Contents | 10 | Table of Contents |
11 | ~~~~~~~~~~~~~~~~~ | 11 | ~~~~~~~~~~~~~~~~~ |
@@ -60,16 +60,18 @@ Installation | |||
60 | The procedures described in this document have been tested with | 60 | The procedures described in this document have been tested with |
61 | distributions from Red Hat's Fedora Project (http://fedora.redhat.com/). | 61 | distributions from Red Hat's Fedora Project (http://fedora.redhat.com/). |
62 | 62 | ||
63 | - Install nfs-utils-1.1.1 or greater on the client | 63 | - Install nfs-utils-1.1.2 or greater on the client |
64 | 64 | ||
65 | An NFS/RDMA mount point can only be obtained by using the mount.nfs | 65 | An NFS/RDMA mount point can be obtained by using the mount.nfs command in |
66 | command in nfs-utils-1.1.1 or greater. To see which version of mount.nfs | 66 | nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils |
67 | you are using, type: | 67 | version with support for NFS/RDMA mounts, but for various reasons we |
68 | recommend using nfs-utils-1.1.2 or greater). To see which version of | ||
69 | mount.nfs you are using, type: | ||
68 | 70 | ||
69 | > /sbin/mount.nfs -V | 71 | $ /sbin/mount.nfs -V |
70 | 72 | ||
71 | If the version is less than 1.1.1 or the command does not exist, | 73 | If the version is less than 1.1.2 or the command does not exist, |
72 | then you will need to install the latest version of nfs-utils. | 74 | you should install the latest version of nfs-utils. |
73 | 75 | ||
74 | Download the latest package from: | 76 | Download the latest package from: |
75 | 77 | ||
@@ -77,22 +79,33 @@ Installation | |||
77 | 79 | ||
78 | Uncompress the package and follow the installation instructions. | 80 | Uncompress the package and follow the installation instructions. |
79 | 81 | ||
80 | If you will not be using GSS and NFSv4, the installation process | 82 | If you will not need the idmapper and gssd executables (you do not need |
81 | can be simplified by disabling these features when running configure: | 83 | these to create an NFS/RDMA enabled mount command), the installation |
84 | process can be simplified by disabling these features when running | ||
85 | configure: | ||
82 | 86 | ||
83 | > ./configure --disable-gss --disable-nfsv4 | 87 | $ ./configure --disable-gss --disable-nfsv4 |
84 | 88 | ||
85 | For more information on this see the package's README and INSTALL files. | 89 | To build nfs-utils you will need the tcp_wrappers package installed. For |
90 | more information on this see the package's README and INSTALL files. | ||
86 | 91 | ||
87 | After building the nfs-utils package, there will be a mount.nfs binary in | 92 | After building the nfs-utils package, there will be a mount.nfs binary in |
88 | the utils/mount directory. This binary can be used to initiate NFS v2, v3, | 93 | the utils/mount directory. This binary can be used to initiate NFS v2, v3, |
89 | or v4 mounts. To initiate a v4 mount, the binary must be called mount.nfs4. | 94 | or v4 mounts. To initiate a v4 mount, the binary must be called |
90 | The standard technique is to create a symlink called mount.nfs4 to mount.nfs. | 95 | mount.nfs4. The standard technique is to create a symlink called |
96 | mount.nfs4 to mount.nfs. | ||
91 | 97 | ||
92 | NOTE: mount.nfs and therefore nfs-utils-1.1.1 or greater is only needed | 98 | This mount.nfs binary should be installed at /sbin/mount.nfs as follows: |
99 | |||
100 | $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs | ||
101 | |||
102 | In this location, mount.nfs will be invoked automatically for NFS mounts | ||
103 | by the system mount command. | ||
104 | |||
105 | NOTE: mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed | ||
93 | on the NFS client machine. You do not need this specific version of | 106 | on the NFS client machine. You do not need this specific version of |
94 | nfs-utils on the server. Furthermore, only the mount.nfs command from | 107 | nfs-utils on the server. Furthermore, only the mount.nfs command from |
95 | nfs-utils-1.1.1 is needed on the client. | 108 | nfs-utils-1.1.2 is needed on the client. |
96 | 109 | ||
97 | - Install a Linux kernel with NFS/RDMA | 110 | - Install a Linux kernel with NFS/RDMA |
98 | 111 | ||
@@ -156,8 +169,8 @@ Check RDMA and NFS Setup | |||
156 | this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel | 169 | this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel |
157 | card: | 170 | card: |
158 | 171 | ||
159 | > modprobe ib_mthca | 172 | $ modprobe ib_mthca |
160 | > modprobe ib_ipoib | 173 | $ modprobe ib_ipoib |
161 | 174 | ||
162 | If you are using InfiniBand, make sure there is a Subnet Manager (SM) | 175 | If you are using InfiniBand, make sure there is a Subnet Manager (SM) |
163 | running on the network. If your IB switch has an embedded SM, you can | 176 | running on the network. If your IB switch has an embedded SM, you can |
@@ -166,7 +179,7 @@ Check RDMA and NFS Setup | |||
166 | 179 | ||
167 | If an SM is running on your network, you should see the following: | 180 | If an SM is running on your network, you should see the following: |
168 | 181 | ||
169 | > cat /sys/class/infiniband/driverX/ports/1/state | 182 | $ cat /sys/class/infiniband/driverX/ports/1/state |
170 | 4: ACTIVE | 183 | 4: ACTIVE |
171 | 184 | ||
172 | where driverX is mthca0, ipath5, ehca3, etc. | 185 | where driverX is mthca0, ipath5, ehca3, etc. |
@@ -174,10 +187,10 @@ Check RDMA and NFS Setup | |||
174 | To further test the InfiniBand software stack, use IPoIB (this | 187 | To further test the InfiniBand software stack, use IPoIB (this |
175 | assumes you have two IB hosts named host1 and host2): | 188 | assumes you have two IB hosts named host1 and host2): |
176 | 189 | ||
177 | host1> ifconfig ib0 a.b.c.x | 190 | host1$ ifconfig ib0 a.b.c.x |
178 | host2> ifconfig ib0 a.b.c.y | 191 | host2$ ifconfig ib0 a.b.c.y |
179 | host1> ping a.b.c.y | 192 | host1$ ping a.b.c.y |
180 | host2> ping a.b.c.x | 193 | host2$ ping a.b.c.x |
181 | 194 | ||
182 | For other device types, follow the appropriate procedures. | 195 | For other device types, follow the appropriate procedures. |
183 | 196 | ||
@@ -202,11 +215,11 @@ NFS/RDMA Setup | |||
202 | /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash) | 215 | /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash) |
203 | /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash) | 216 | /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash) |
204 | 217 | ||
205 | The IP address(es) is(are) the client's IPoIB address for an InfiniBand HCA or the | 218 | The IP address(es) is(are) the client's IPoIB address for an InfiniBand |
206 | client's iWARP address(es) for an RNIC. | 219 | HCA or the client's iWARP address(es) for an RNIC. |
207 | 220 | ||
208 | NOTE: The "insecure" option must be used because the NFS/RDMA client does not | 221 | NOTE: The "insecure" option must be used because the NFS/RDMA client does |
209 | use a reserved port. | 222 | not use a reserved port. |
210 | 223 | ||
211 | Each time a machine boots: | 224 | Each time a machine boots: |
212 | 225 | ||
@@ -214,43 +227,45 @@ NFS/RDMA Setup | |||
214 | 227 | ||
215 | For InfiniBand using a Mellanox adapter: | 228 | For InfiniBand using a Mellanox adapter: |
216 | 229 | ||
217 | > modprobe ib_mthca | 230 | $ modprobe ib_mthca |
218 | > modprobe ib_ipoib | 231 | $ modprobe ib_ipoib |
219 | > ifconfig ib0 a.b.c.d | 232 | $ ifconfig ib0 a.b.c.d |
220 | 233 | ||
221 | NOTE: use unique addresses for the client and server | 234 | NOTE: use unique addresses for the client and server |
222 | 235 | ||
223 | - Start the NFS server | 236 | - Start the NFS server |
224 | 237 | ||
225 | If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config), | 238 | If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in |
226 | load the RDMA transport module: | 239 | kernel config), load the RDMA transport module: |
227 | 240 | ||
228 | > modprobe svcrdma | 241 | $ modprobe svcrdma |
229 | 242 | ||
230 | Regardless of how the server was built (module or built-in), start the server: | 243 | Regardless of how the server was built (module or built-in), start the |
244 | server: | ||
231 | 245 | ||
232 | > /etc/init.d/nfs start | 246 | $ /etc/init.d/nfs start |
233 | 247 | ||
234 | or | 248 | or |
235 | 249 | ||
236 | > service nfs start | 250 | $ service nfs start |
237 | 251 | ||
238 | Instruct the server to listen on the RDMA transport: | 252 | Instruct the server to listen on the RDMA transport: |
239 | 253 | ||
240 | > echo rdma 2050 > /proc/fs/nfsd/portlist | 254 | $ echo rdma 2050 > /proc/fs/nfsd/portlist |
241 | 255 | ||
242 | - On the client system | 256 | - On the client system |
243 | 257 | ||
244 | If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in kernel config), | 258 | If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in |
245 | load the RDMA client module: | 259 | kernel config), load the RDMA client module: |
246 | 260 | ||
247 | > modprobe xprtrdma | 261 | $ modprobe xprtrdma |
248 | 262 | ||
249 | Regardless of how the client was built (module or built-in), issue the mount.nfs command: | 263 | Regardless of how the client was built (module or built-in), use this |
264 | command to mount the NFS/RDMA server: | ||
250 | 265 | ||
251 | > /path/to/your/mount.nfs <IPoIB-server-name-or-address>:/<export> /mnt -i -o rdma,port=2050 | 266 | $ mount -o rdma,port=2050 <IPoIB-server-name-or-address>:/<export> /mnt |
252 | 267 | ||
253 | To verify that the mount is using RDMA, run "cat /proc/mounts" and check the | 268 | To verify that the mount is using RDMA, run "cat /proc/mounts" and check |
254 | "proto" field for the given mount. | 269 | the "proto" field for the given mount. |
255 | 270 | ||
256 | Congratulations! You're using NFS/RDMA! | 271 | Congratulations! You're using NFS/RDMA! |
diff --git a/Documentation/filesystems/omfs.txt b/Documentation/filesystems/omfs.txt new file mode 100644 index 000000000000..1d0d41ff5c65 --- /dev/null +++ b/Documentation/filesystems/omfs.txt | |||
@@ -0,0 +1,106 @@ | |||
1 | Optimized MPEG Filesystem (OMFS) | ||
2 | |||
3 | Overview | ||
4 | ======== | ||
5 | |||
6 | OMFS is a filesystem created by SonicBlue for use in the ReplayTV DVR | ||
7 | and Rio Karma MP3 player. The filesystem is extent-based, utilizing | ||
8 | block sizes from 2k to 8k, with hash-based directories. This | ||
9 | filesystem driver may be used to read and write disks from these | ||
10 | devices. | ||
11 | |||
12 | Note, it is not recommended that this FS be used in place of a general | ||
13 | filesystem for your own streaming media device. Native Linux filesystems | ||
14 | will likely perform better. | ||
15 | |||
16 | More information is available at: | ||
17 | |||
18 | http://linux-karma.sf.net/ | ||
19 | |||
20 | Various utilities, including mkomfs and omfsck, are included with | ||
21 | omfsprogs, available at: | ||
22 | |||
23 | http://bobcopeland.com/karma/ | ||
24 | |||
25 | Instructions are included in its README. | ||
26 | |||
27 | Options | ||
28 | ======= | ||
29 | |||
30 | OMFS supports the following mount-time options: | ||
31 | |||
32 | uid=n - make all files owned by specified user | ||
33 | gid=n - make all files owned by specified group | ||
34 | umask=xxx - set permission umask to xxx | ||
35 | fmask=xxx - set umask to xxx for files | ||
36 | dmask=xxx - set umask to xxx for directories | ||
37 | |||
38 | Disk format | ||
39 | =========== | ||
40 | |||
41 | OMFS discriminates between "sysblocks" and normal data blocks. The sysblock | ||
42 | group consists of super block information, file metadata, directory structures, | ||
43 | and extents. Each sysblock has a header containing CRCs of the entire | ||
44 | sysblock, and may be mirrored in successive blocks on the disk. A sysblock may | ||
45 | have a smaller size than a data block, but since they are both addressed by the | ||
46 | same 64-bit block number, any remaining space in the smaller sysblock is | ||
47 | unused. | ||
48 | |||
49 | Sysblock header information: | ||
50 | |||
51 | struct omfs_header { | ||
52 | __be64 h_self; /* FS block where this is located */ | ||
53 | __be32 h_body_size; /* size of useful data after header */ | ||
54 | __be16 h_crc; /* crc-ccitt of body_size bytes */ | ||
55 | char h_fill1[2]; | ||
56 | u8 h_version; /* version, always 1 */ | ||
57 | char h_type; /* OMFS_INODE_X */ | ||
58 | u8 h_magic; /* OMFS_IMAGIC */ | ||
59 | u8 h_check_xor; /* XOR of header bytes before this */ | ||
60 | __be32 h_fill2; | ||
61 | }; | ||
62 | |||
63 | Files and directories are both represented by omfs_inode: | ||
64 | |||
65 | struct omfs_inode { | ||
66 | struct omfs_header i_head; /* header */ | ||
67 | __be64 i_parent; /* parent containing this inode */ | ||
68 | __be64 i_sibling; /* next inode in hash bucket */ | ||
69 | __be64 i_ctime; /* ctime, in milliseconds */ | ||
70 | char i_fill1[35]; | ||
71 | char i_type; /* OMFS_[DIR,FILE] */ | ||
72 | __be32 i_fill2; | ||
73 | char i_fill3[64]; | ||
74 | char i_name[OMFS_NAMELEN]; /* filename */ | ||
75 | __be64 i_size; /* size of file, in bytes */ | ||
76 | }; | ||
77 | |||
78 | Directories in OMFS are implemented as a large hash table. Filenames are | ||
79 | hashed then prepended into the bucket list beginning at OMFS_DIR_START. | ||
80 | Lookup requires hashing the filename, then seeking across i_sibling pointers | ||
81 | until a match is found on i_name. Empty buckets are represented by block | ||
82 | pointers with all-1s (~0). | ||
83 | |||
84 | A file is an omfs_inode structure followed by an extent table beginning at | ||
85 | OMFS_EXTENT_START: | ||
86 | |||
87 | struct omfs_extent_entry { | ||
88 | __be64 e_cluster; /* start location of a set of blocks */ | ||
89 | __be64 e_blocks; /* number of blocks after e_cluster */ | ||
90 | }; | ||
91 | |||
92 | struct omfs_extent { | ||
93 | __be64 e_next; /* next extent table location */ | ||
94 | __be32 e_extent_count; /* total # extents in this table */ | ||
95 | __be32 e_fill; | ||
96 | struct omfs_extent_entry e_entry; /* start of extent entries */ | ||
97 | }; | ||
98 | |||
99 | Each extent holds the block offset followed by number of blocks allocated to | ||
100 | the extent. The final extent in each table is a terminator with e_cluster | ||
101 | being ~0 and e_blocks being ones'-complement of the total number of blocks | ||
102 | in the table. | ||
103 | |||
104 | If this table overflows, a continuation inode is written and pointed to by | ||
105 | e_next. These have a header but lack the rest of the inode structure. | ||
106 | |||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index dbc3c6a3650f..64557821ee59 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -296,6 +296,7 @@ Table 1-4: Kernel info in /proc | |||
296 | uptime System uptime | 296 | uptime System uptime |
297 | version Kernel version | 297 | version Kernel version |
298 | video bttv info of video resources (2.4) | 298 | video bttv info of video resources (2.4) |
299 | vmallocinfo Show vmalloced areas | ||
299 | .............................................................................. | 300 | .............................................................................. |
300 | 301 | ||
301 | You can, for example, check which interrupts are currently in use and what | 302 | You can, for example, check which interrupts are currently in use and what |
@@ -380,28 +381,35 @@ i386 and x86_64 platforms support the new IRQ vector displays. | |||
380 | Of some interest is the introduction of the /proc/irq directory to 2.4. | 381 | Of some interest is the introduction of the /proc/irq directory to 2.4. |
381 | It could be used to set IRQ to CPU affinity, this means that you can "hook" an | 382 | It could be used to set IRQ to CPU affinity, this means that you can "hook" an |
382 | IRQ to only one CPU, or to exclude a CPU from handling IRQs. The contents of the | 383 | IRQ to only one CPU, or to exclude a CPU from handling IRQs. The contents of the |
383 | irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask | 384 | irq subdir is one subdir for each IRQ, and two files; default_smp_affinity and |
385 | prof_cpu_mask. | ||
384 | 386 | ||
385 | For example | 387 | For example |
386 | > ls /proc/irq/ | 388 | > ls /proc/irq/ |
387 | 0 10 12 14 16 18 2 4 6 8 prof_cpu_mask | 389 | 0 10 12 14 16 18 2 4 6 8 prof_cpu_mask |
388 | 1 11 13 15 17 19 3 5 7 9 | 390 | 1 11 13 15 17 19 3 5 7 9 default_smp_affinity |
389 | > ls /proc/irq/0/ | 391 | > ls /proc/irq/0/ |
390 | smp_affinity | 392 | smp_affinity |
391 | 393 | ||
392 | The contents of the prof_cpu_mask file and each smp_affinity file for each IRQ | 394 | smp_affinity is a bitmask, in which you can specify which CPUs can handle the |
393 | is the same by default: | 395 | IRQ, you can set it by doing: |
394 | 396 | ||
395 | > cat /proc/irq/0/smp_affinity | 397 | > echo 1 > /proc/irq/10/smp_affinity |
396 | ffffffff | 398 | |
399 | This means that only the first CPU will handle the IRQ, but you can also echo | ||
400 | 5 which means that only the first and third CPU can handle the IRQ. | ||
401 | |||
402 | The contents of each smp_affinity file is the same by default: | ||
397 | 403 | ||
398 | It's a bitmask, in which you can specify which CPUs can handle the IRQ, you can | 404 | > cat /proc/irq/0/smp_affinity |
399 | set it by doing: | 405 | ffffffff |
400 | 406 | ||
401 | > echo 1 > /proc/irq/prof_cpu_mask | 407 | The default_smp_affinity mask applies to all non-active IRQs, which are the |
408 | IRQs which have not yet been allocated/activated, and hence which lack a | ||
409 | /proc/irq/[0-9]* directory. | ||
402 | 410 | ||
403 | This means that only the first CPU will handle the IRQ, but you can also echo 5 | 411 | prof_cpu_mask specifies which CPUs are to be profiled by the system wide |
404 | which means that only the first and third CPU can handle the IRQ. | 412 | profiler. Default value is ffffffff (all cpus). |
405 | 413 | ||
406 | The way IRQs are routed is handled by the IO-APIC, and it's Round Robin | 414 | The way IRQs are routed is handled by the IO-APIC, and it's Round Robin |
407 | between all the CPUs which are allowed to handle it. As usual the kernel has | 415 | between all the CPUs which are allowed to handle it. As usual the kernel has |
@@ -550,6 +558,49 @@ VmallocTotal: total size of vmalloc memory area | |||
550 | VmallocUsed: amount of vmalloc area which is used | 558 | VmallocUsed: amount of vmalloc area which is used |
551 | VmallocChunk: largest contiguous block of vmalloc area which is free | 559 | VmallocChunk: largest contiguous block of vmalloc area which is free |
552 | 560 | ||
561 | .............................................................................. | ||
562 | |||
563 | vmallocinfo: | ||
564 | |||
565 | Provides information about vmalloced/vmapped areas. One line per area, | ||
566 | containing the virtual address range of the area, size in bytes, | ||
567 | caller information of the creator, and optional information depending | ||
568 | on the kind of area: | ||
569 | |||
570 | pages=nr number of pages | ||
571 | phys=addr if a physical address was specified | ||
572 | ioremap I/O mapping (ioremap() and friends) | ||
573 | vmalloc vmalloc() area | ||
574 | vmap vmap()ed pages | ||
575 | user VM_USERMAP area | ||
576 | vpages buffer for page pointers was vmalloced (huge area) | ||
577 | N<node>=nr (Only on NUMA kernels) | ||
578 | Number of pages allocated on memory node <node> | ||
579 | |||
580 | > cat /proc/vmallocinfo | ||
581 | 0xffffc20000000000-0xffffc20000201000 2101248 alloc_large_system_hash+0x204 ... | ||
582 | /0x2c0 pages=512 vmalloc N0=128 N1=128 N2=128 N3=128 | ||
583 | 0xffffc20000201000-0xffffc20000302000 1052672 alloc_large_system_hash+0x204 ... | ||
584 | /0x2c0 pages=256 vmalloc N0=64 N1=64 N2=64 N3=64 | ||
585 | 0xffffc20000302000-0xffffc20000304000 8192 acpi_tb_verify_table+0x21/0x4f... | ||
586 | phys=7fee8000 ioremap | ||
587 | 0xffffc20000304000-0xffffc20000307000 12288 acpi_tb_verify_table+0x21/0x4f... | ||
588 | phys=7fee7000 ioremap | ||
589 | 0xffffc2000031d000-0xffffc2000031f000 8192 init_vdso_vars+0x112/0x210 | ||
590 | 0xffffc2000031f000-0xffffc2000032b000 49152 cramfs_uncompress_init+0x2e ... | ||
591 | /0x80 pages=11 vmalloc N0=3 N1=3 N2=2 N3=3 | ||
592 | 0xffffc2000033a000-0xffffc2000033d000 12288 sys_swapon+0x640/0xac0 ... | ||
593 | pages=2 vmalloc N1=2 | ||
594 | 0xffffc20000347000-0xffffc2000034c000 20480 xt_alloc_table_info+0xfe ... | ||
595 | /0x130 [x_tables] pages=4 vmalloc N0=4 | ||
596 | 0xffffffffa0000000-0xffffffffa000f000 61440 sys_init_module+0xc27/0x1d00 ... | ||
597 | pages=14 vmalloc N2=14 | ||
598 | 0xffffffffa000f000-0xffffffffa0014000 20480 sys_init_module+0xc27/0x1d00 ... | ||
599 | pages=4 vmalloc N1=4 | ||
600 | 0xffffffffa0014000-0xffffffffa0017000 12288 sys_init_module+0xc27/0x1d00 ... | ||
601 | pages=2 vmalloc N1=2 | ||
602 | 0xffffffffa0017000-0xffffffffa0022000 45056 sys_init_module+0xc27/0x1d00 ... | ||
603 | pages=10 vmalloc N0=10 | ||
553 | 604 | ||
554 | 1.3 IDE devices in /proc/ide | 605 | 1.3 IDE devices in /proc/ide |
555 | ---------------------------- | 606 | ---------------------------- |
@@ -880,7 +931,7 @@ group_prealloc max_to_scan mb_groups mb_history min_to_scan order2_req | |||
880 | stats stream_req | 931 | stats stream_req |
881 | 932 | ||
882 | mb_groups: | 933 | mb_groups: |
883 | This file gives the details of mutiblock allocator buddy cache of free blocks | 934 | This file gives the details of multiblock allocator buddy cache of free blocks |
884 | 935 | ||
885 | mb_history: | 936 | mb_history: |
886 | Multiblock allocation history. | 937 | Multiblock allocation history. |
@@ -1423,7 +1474,7 @@ used because pages_free(1355) is smaller than watermark + protection[2] | |||
1423 | normal page requirement. If requirement is DMA zone(index=0), protection[0] | 1474 | normal page requirement. If requirement is DMA zone(index=0), protection[0] |
1424 | (=0) is used. | 1475 | (=0) is used. |
1425 | 1476 | ||
1426 | zone[i]'s protection[j] is calculated by following exprssion. | 1477 | zone[i]'s protection[j] is calculated by following expression. |
1427 | 1478 | ||
1428 | (i < j): | 1479 | (i < j): |
1429 | zone[i]->protection[j] | 1480 | zone[i]->protection[j] |
diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt index 094f2d2f38b1..510b722667ac 100644 --- a/Documentation/filesystems/relay.txt +++ b/Documentation/filesystems/relay.txt | |||
@@ -294,6 +294,16 @@ user-defined data with a channel, and is immediately available | |||
294 | (including in create_buf_file()) via chan->private_data or | 294 | (including in create_buf_file()) via chan->private_data or |
295 | buf->chan->private_data. | 295 | buf->chan->private_data. |
296 | 296 | ||
297 | Buffer-only channels | ||
298 | -------------------- | ||
299 | |||
300 | These channels have no files associated and can be created with | ||
301 | relay_open(NULL, NULL, ...). Such channels are useful in scenarios such | ||
302 | as when doing early tracing in the kernel, before the VFS is up. In these | ||
303 | cases, one may open a buffer-only channel and then call | ||
304 | relay_late_setup_files() when the kernel is ready to handle files, | ||
305 | to expose the buffered data to the userspace. | ||
306 | |||
297 | Channel 'modes' | 307 | Channel 'modes' |
298 | --------------- | 308 | --------------- |
299 | 309 | ||
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 7f27b8f840d0..9e9c348275a9 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt | |||
@@ -248,6 +248,7 @@ The top level sysfs directory looks like: | |||
248 | block/ | 248 | block/ |
249 | bus/ | 249 | bus/ |
250 | class/ | 250 | class/ |
251 | dev/ | ||
251 | devices/ | 252 | devices/ |
252 | firmware/ | 253 | firmware/ |
253 | net/ | 254 | net/ |
@@ -274,6 +275,11 @@ fs/ contains a directory for some filesystems. Currently each | |||
274 | filesystem wanting to export attributes must create its own hierarchy | 275 | filesystem wanting to export attributes must create its own hierarchy |
275 | below fs/ (see ./fuse.txt for an example). | 276 | below fs/ (see ./fuse.txt for an example). |
276 | 277 | ||
278 | dev/ contains two directories char/ and block/. Inside these two | ||
279 | directories there are symlinks named <major>:<minor>. These symlinks | ||
280 | point to the sysfs directory for the given device. /sys/dev provides a | ||
281 | quick way to look up the sysfs interface for a device from the result of | ||
282 | a stat(2) operation. | ||
277 | 283 | ||
278 | More information on driver-model specific features can be found in | 284 | More information on driver-model specific features can be found in |
279 | Documentation/driver-model/. | 285 | Documentation/driver-model/. |
diff --git a/Documentation/filesystems/ubifs.txt b/Documentation/filesystems/ubifs.txt new file mode 100644 index 000000000000..540e9e7f59c5 --- /dev/null +++ b/Documentation/filesystems/ubifs.txt | |||
@@ -0,0 +1,164 @@ | |||
1 | Introduction | ||
2 | ============= | ||
3 | |||
4 | UBIFS file-system stands for UBI File System. UBI stands for "Unsorted | ||
5 | Block Images". UBIFS is a flash file system, which means it is designed | ||
6 | to work with flash devices. It is important to understand, that UBIFS | ||
7 | is completely different to any traditional file-system in Linux, like | ||
8 | Ext2, XFS, JFS, etc. UBIFS represents a separate class of file-systems | ||
9 | which work with MTD devices, not block devices. The other Linux | ||
10 | file-system of this class is JFFS2. | ||
11 | |||
12 | To make it more clear, here is a small comparison of MTD devices and | ||
13 | block devices. | ||
14 | |||
15 | 1 MTD devices represent flash devices and they consist of eraseblocks of | ||
16 | rather large size, typically about 128KiB. Block devices consist of | ||
17 | small blocks, typically 512 bytes. | ||
18 | 2 MTD devices support 3 main operations - read from some offset within an | ||
19 | eraseblock, write to some offset within an eraseblock, and erase a whole | ||
20 | eraseblock. Block devices support 2 main operations - read a whole | ||
21 | block and write a whole block. | ||
22 | 3 The whole eraseblock has to be erased before it becomes possible to | ||
23 | re-write its contents. Blocks may be just re-written. | ||
24 | 4 Eraseblocks become worn out after some number of erase cycles - | ||
25 | typically 100K-1G for SLC NAND and NOR flashes, and 1K-10K for MLC | ||
26 | NAND flashes. Blocks do not have the wear-out property. | ||
27 | 5 Eraseblocks may become bad (only on NAND flashes) and software should | ||
28 | deal with this. Blocks on hard drives typically do not become bad, | ||
29 | because hardware has mechanisms to substitute bad blocks, at least in | ||
30 | modern LBA disks. | ||
31 | |||
32 | It should be quite obvious why UBIFS is very different to traditional | ||
33 | file-systems. | ||
34 | |||
35 | UBIFS works on top of UBI. UBI is a separate software layer which may be | ||
36 | found in drivers/mtd/ubi. UBI is basically a volume management and | ||
37 | wear-leveling layer. It provides so called UBI volumes which is a higher | ||
38 | level abstraction than a MTD device. The programming model of UBI devices | ||
39 | is very similar to MTD devices - they still consist of large eraseblocks, | ||
40 | they have read/write/erase operations, but UBI devices are devoid of | ||
41 | limitations like wear and bad blocks (items 4 and 5 in the above list). | ||
42 | |||
43 | In a sense, UBIFS is a next generation of JFFS2 file-system, but it is | ||
44 | very different and incompatible to JFFS2. The following are the main | ||
45 | differences. | ||
46 | |||
47 | * JFFS2 works on top of MTD devices, UBIFS depends on UBI and works on | ||
48 | top of UBI volumes. | ||
49 | * JFFS2 does not have on-media index and has to build it while mounting, | ||
50 | which requires full media scan. UBIFS maintains the FS indexing | ||
51 | information on the flash media and does not require full media scan, | ||
52 | so it mounts many times faster than JFFS2. | ||
53 | * JFFS2 is a write-through file-system, while UBIFS supports write-back, | ||
54 | which makes UBIFS much faster on writes. | ||
55 | |||
56 | Similarly to JFFS2, UBIFS supports on-the-fly compression which makes | ||
57 | it possible to fit quite a lot of data to the flash. | ||
58 | |||
59 | Similarly to JFFS2, UBIFS is tolerant of unclean reboots and power-cuts. | ||
60 | It does not need stuff like fsck.ext2. UBIFS automatically replays its | ||
61 | journal and recovers from crashes, ensuring that the on-flash data | ||
62 | structures are consistent. | ||
63 | |||
64 | UBIFS scales logarithmically (most of the data structures it uses are | ||
65 | trees), so the mount time and memory consumption do not linearly depend | ||
66 | on the flash size, like in case of JFFS2. This is because UBIFS | ||
67 | maintains the FS index on the flash media. However, UBIFS depends on | ||
68 | UBI, which scales linearly. So overall UBI/UBIFS stack scales linearly. | ||
69 | Nevertheless, UBI/UBIFS scales considerably better than JFFS2. | ||
70 | |||
71 | The authors of UBIFS believe, that it is possible to develop UBI2 which | ||
72 | would scale logarithmically as well. UBI2 would support the same API as UBI, | ||
73 | but it would be binary incompatible to UBI. So UBIFS would not need to be | ||
74 | changed to use UBI2. | ||
75 | |||
76 | |||
77 | Mount options | ||
78 | ============= | ||
79 | |||
80 | (*) == default. | ||
81 | |||
82 | norm_unmount (*) commit on unmount; the journal is committed | ||
83 | when the file-system is unmounted so that the | ||
84 | next mount does not have to replay the journal | ||
85 | and it becomes very fast; | ||
86 | fast_unmount do not commit on unmount; this option makes | ||
87 | unmount faster, but the next mount slower | ||
88 | because of the need to replay the journal. | ||
89 | |||
90 | |||
91 | Quick usage instructions | ||
92 | ======================== | ||
93 | |||
94 | The UBI volume to mount is specified using "ubiX_Y" or "ubiX:NAME" syntax, | ||
95 | where "X" is UBI device number, "Y" is UBI volume number, and "NAME" is | ||
96 | UBI volume name. | ||
97 | |||
98 | Mount volume 0 on UBI device 0 to /mnt/ubifs: | ||
99 | $ mount -t ubifs ubi0_0 /mnt/ubifs | ||
100 | |||
101 | Mount "rootfs" volume of UBI device 0 to /mnt/ubifs ("rootfs" is volume | ||
102 | name): | ||
103 | $ mount -t ubifs ubi0:rootfs /mnt/ubifs | ||
104 | |||
105 | The following is an example of the kernel boot arguments to attach mtd0 | ||
106 | to UBI and mount volume "rootfs": | ||
107 | ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs | ||
108 | |||
109 | |||
110 | Module Parameters for Debugging | ||
111 | =============================== | ||
112 | |||
113 | When UBIFS has been compiled with debugging enabled, there are 3 module | ||
114 | parameters that are available to control aspects of testing and debugging. | ||
115 | The parameters are unsigned integers where each bit controls an option. | ||
116 | The parameters are: | ||
117 | |||
118 | debug_msgs Selects which debug messages to display, as follows: | ||
119 | |||
120 | Message Type Flag value | ||
121 | |||
122 | General messages 1 | ||
123 | Journal messages 2 | ||
124 | Mount messages 4 | ||
125 | Commit messages 8 | ||
126 | LEB search messages 16 | ||
127 | Budgeting messages 32 | ||
128 | Garbage collection messages 64 | ||
129 | Tree Node Cache (TNC) messages 128 | ||
130 | LEB properties (lprops) messages 256 | ||
131 | Input/output messages 512 | ||
132 | Log messages 1024 | ||
133 | Scan messages 2048 | ||
134 | Recovery messages 4096 | ||
135 | |||
136 | debug_chks Selects extra checks that UBIFS can do while running: | ||
137 | |||
138 | Check Flag value | ||
139 | |||
140 | General checks 1 | ||
141 | Check Tree Node Cache (TNC) 2 | ||
142 | Check indexing tree size 4 | ||
143 | Check orphan area 8 | ||
144 | Check old indexing tree 16 | ||
145 | Check LEB properties (lprops) 32 | ||
146 | Check leaf nodes and inodes 64 | ||
147 | |||
148 | debug_tsts Selects a mode of testing, as follows: | ||
149 | |||
150 | Test mode Flag value | ||
151 | |||
152 | Force in-the-gaps method 2 | ||
153 | Failure mode for recovery testing 4 | ||
154 | |||
155 | For example, set debug_msgs to 5 to display General messages and Mount | ||
156 | messages. | ||
157 | |||
158 | |||
159 | References | ||
160 | ========== | ||
161 | |||
162 | UBIFS documentation and FAQ/HOWTO at the MTD web site: | ||
163 | http://www.linux-mtd.infradead.org/doc/ubifs.html | ||
164 | http://www.linux-mtd.infradead.org/faq/ubifs.html | ||
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt index 2d5e1e582e13..bbac4f1d9056 100644 --- a/Documentation/filesystems/vfat.txt +++ b/Documentation/filesystems/vfat.txt | |||
@@ -96,6 +96,14 @@ shortname=lower|win95|winnt|mixed | |||
96 | emulate the Windows 95 rule for create. | 96 | emulate the Windows 95 rule for create. |
97 | Default setting is `lower'. | 97 | Default setting is `lower'. |
98 | 98 | ||
99 | tz=UTC -- Interpret timestamps as UTC rather than local time. | ||
100 | This option disables the conversion of timestamps | ||
101 | between local time (as used by Windows on FAT) and UTC | ||
102 | (which Linux uses internally). This is particularly | ||
103 | useful when mounting devices (like digital cameras) | ||
104 | that are set to UTC in order to avoid the pitfalls of | ||
105 | local time. | ||
106 | |||
99 | <bool>: 0,1,yes,no,true,false | 107 | <bool>: 0,1,yes,no,true,false |
100 | 108 | ||
101 | TODO | 109 | TODO |
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b7522c6cbae3..c4d348dabe94 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -143,7 +143,7 @@ struct file_system_type { | |||
143 | 143 | ||
144 | The get_sb() method has the following arguments: | 144 | The get_sb() method has the following arguments: |
145 | 145 | ||
146 | struct file_system_type *fs_type: decribes the filesystem, partly initialized | 146 | struct file_system_type *fs_type: describes the filesystem, partly initialized |
147 | by the specific filesystem code | 147 | by the specific filesystem code |
148 | 148 | ||
149 | int flags: mount flags | 149 | int flags: mount flags |
@@ -895,9 +895,9 @@ struct dentry_operations { | |||
895 | iput() yourself | 895 | iput() yourself |
896 | 896 | ||
897 | d_dname: called when the pathname of a dentry should be generated. | 897 | d_dname: called when the pathname of a dentry should be generated. |
898 | Usefull for some pseudo filesystems (sockfs, pipefs, ...) to delay | 898 | Useful for some pseudo filesystems (sockfs, pipefs, ...) to delay |
899 | pathname generation. (Instead of doing it when dentry is created, | 899 | pathname generation. (Instead of doing it when dentry is created, |
900 | its done only when the path is needed.). Real filesystems probably | 900 | it's done only when the path is needed.). Real filesystems probably |
901 | don't want to use it, because their dentries are present in global | 901 | don't want to use it, because their dentries are present in global |
902 | dcache hash, so their hash should be an invariant. As no lock is | 902 | dcache hash, so their hash should be an invariant. As no lock is |
903 | held, d_dname() should not try to modify the dentry itself, unless | 903 | held, d_dname() should not try to modify the dentry itself, unless |