diff options
Diffstat (limited to 'Documentation/filesystems/files.txt')
-rw-r--r-- | Documentation/filesystems/files.txt | 123 |
1 files changed, 123 insertions, 0 deletions
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt new file mode 100644 index 000000000000..8c206f4e0250 --- /dev/null +++ b/Documentation/filesystems/files.txt | |||
@@ -0,0 +1,123 @@ | |||
1 | File management in the Linux kernel | ||
2 | ----------------------------------- | ||
3 | |||
4 | This document describes how locking for files (struct file) | ||
5 | and the file descriptor table (struct files_struct) works. | ||
6 | |||
7 | Up until 2.6.12, the file descriptor table was protected | ||
8 | with a lock (files->file_lock) and reference count (files->count). | ||
9 | ->file_lock protected accesses to all the file related fields | ||
10 | of the table. ->count was used for sharing the file descriptor | ||
11 | table between tasks cloned with CLONE_FILES flag. Typically | ||
12 | this would be the case for posix threads. As with the common | ||
13 | refcounting model in the kernel, the last task doing | ||
14 | a put_files_struct() frees the file descriptor (fd) table. | ||
15 | The files (struct file) themselves are protected using | ||
16 | a reference count (->f_count). | ||
17 | |||
18 | In the new lock-free model of file descriptor management, | ||
19 | the reference counting is similar, but the locking is | ||
20 | based on RCU. The file descriptor table contains multiple | ||
21 | elements - the fd sets (open_fds and close_on_exec), the | ||
22 | array of file pointers, the sizes of the sets and the array | ||
23 | etc. In order for the updates to appear atomic to | ||
24 | a lock-free reader, all the elements of the file descriptor | ||
25 | table are in a separate structure - struct fdtable. | ||
26 | files_struct contains a pointer to struct fdtable through | ||
27 | which the actual fd table is accessed. Initially the | ||
28 | fdtable is embedded in files_struct itself. On a subsequent | ||
29 | expansion of fdtable, a new fdtable structure is allocated | ||
30 | and files->fdtab points to the new structure. The fdtable | ||
31 | structure is freed with RCU and lock-free readers either | ||
32 | see the old fdtable or the new fdtable making the update | ||
33 | appear atomic. Here are the locking rules for | ||
34 | the fdtable structure - | ||
35 | |||
36 | 1. All references to the fdtable must be done through | ||
37 | the files_fdtable() macro : | ||
38 | |||
39 | struct fdtable *fdt; | ||
40 | |||
41 | rcu_read_lock(); | ||
42 | |||
43 | fdt = files_fdtable(files); | ||
44 | .... | ||
45 | if (n <= fdt->max_fds) | ||
46 | .... | ||
47 | ... | ||
48 | rcu_read_unlock(); | ||
49 | |||
50 | files_fdtable() uses the rcu_dereference() macro which takes care of | ||
51 | the memory barrier requirements for lock-free dereference. | ||
52 | The fdtable pointer must be read within the read-side | ||
53 | critical section. | ||
54 | |||
55 | 2. Reading of the fdtable as described above must be protected | ||
56 | by rcu_read_lock()/rcu_read_unlock(). | ||
57 | |||
58 | 3. For any update to the fd table, files->file_lock must | ||
59 | be held. | ||
60 | |||
61 | 4. To look up the file structure given an fd, a reader | ||
62 | must use either fcheck() or fcheck_files() APIs. These | ||
63 | take care of barrier requirements due to lock-free lookup. | ||
64 | An example : | ||
65 | |||
66 | struct file *file; | ||
67 | |||
68 | rcu_read_lock(); | ||
69 | file = fcheck(fd); | ||
70 | if (file) { | ||
71 | ... | ||
72 | } | ||
73 | .... | ||
74 | rcu_read_unlock(); | ||
75 | |||
76 | 5. Handling of the file structures is special. Since the look-up | ||
77 | of the fd (fget()/fget_light()) is lock-free, it is possible | ||
78 | that look-up may race with the last put() operation on the | ||
79 | file structure. This is avoided using the rcuref APIs | ||
80 | on ->f_count : | ||
81 | |||
82 | rcu_read_lock(); | ||
83 | file = fcheck_files(files, fd); | ||
84 | if (file) { | ||
85 | if (rcuref_inc_lf(&file->f_count)) | ||
86 | *fput_needed = 1; | ||
87 | else | ||
88 | /* Didn't get the reference, someone's freed */ | ||
89 | file = NULL; | ||
90 | } | ||
91 | rcu_read_unlock(); | ||
92 | .... | ||
93 | return file; | ||
94 | |||
95 | rcuref_inc_lf() detects if the refcount is already zero or | ||
96 | goes to zero during increment. If so, we fail | ||
97 | fget()/fget_light(). | ||
98 | |||
99 | 6. Since both fdtable and file structures can be looked up | ||
100 | lock-free, they must be installed using rcu_assign_pointer() | ||
101 | API. If they are looked up lock-free, rcu_dereference() | ||
102 | must be used. However it is advisable to use files_fdtable() | ||
103 | and fcheck()/fcheck_files() which take care of these issues. | ||
104 | |||
105 | 7. While updating, the fdtable pointer must be looked up while | ||
106 | holding files->file_lock. If ->file_lock is dropped, then | ||
107 | another thread may expand the files, thereby creating a new | ||
108 | fdtable and making the earlier fdtable pointer stale. | ||
109 | For example : | ||
110 | |||
111 | spin_lock(&files->file_lock); | ||
112 | fd = locate_fd(files, file, start); | ||
113 | if (fd >= 0) { | ||
114 | /* locate_fd() may have expanded fdtable, load the ptr */ | ||
115 | fdt = files_fdtable(files); | ||
116 | FD_SET(fd, fdt->open_fds); | ||
117 | FD_CLR(fd, fdt->close_on_exec); | ||
118 | spin_unlock(&files->file_lock); | ||
119 | ..... | ||
120 | |||
121 | Since locate_fd() can drop ->file_lock (and reacquire ->file_lock), | ||
122 | the fdtable pointer (fdt) must be loaded after locate_fd(). | ||
123 | |||