Diffstat (limited to 'Documentation/filesystems')
23 files changed, 4001 insertions, 1108 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 52cd611277a3..8dd6db76171d 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX | |||
@@ -68,6 +68,8 @@ ncpfs.txt | |||
68 | - info on Novell Netware(tm) filesystem using NCP protocol. | 68 | - info on Novell Netware(tm) filesystem using NCP protocol. |
69 | nfsroot.txt | 69 | nfsroot.txt |
70 | - short guide on setting up a diskless box with NFS root filesystem. | 70 | - short guide on setting up a diskless box with NFS root filesystem. |
71 | nilfs2.txt | ||
72 | - info and mount options for the NILFS2 filesystem. | ||
71 | ntfs.txt | 73 | ntfs.txt |
72 | - info and mount options for the NTFS filesystem (Windows NT). | 74 | - info and mount options for the NTFS filesystem (Windows NT). |
73 | ocfs2.txt | 75 | ocfs2.txt |
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index ec6a9392a173..3120f8dd2c31 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -437,8 +437,11 @@ grab BKL for cases when we close a file that had been opened r/w, but that | |||
437 | can and should be done using the internal locking with smaller critical areas). | 437 | can and should be done using the internal locking with smaller critical areas). |
438 | Current worst offender is ext2_get_block()... | 438 | Current worst offender is ext2_get_block()... |
439 | 439 | ||
440 | ->fasync() is a mess. This area needs a big cleanup and that will probably | 440 | ->fasync() is called without BKL protection, and is responsible for |
441 | affect locking. | 441 | maintaining the FASYNC bit in filp->f_flags. Most instances call |
442 | fasync_helper(), which does that maintenance, so it's not normally | ||
443 | something one needs to worry about. Return values > 0 will be mapped to | ||
444 | zero in the VFS layer. | ||
442 | 445 | ||
443 | ->readdir() and ->ioctl() on directories must be changed. Ideally we would | 446 | ->readdir() and ->ioctl() on directories must be changed. Ideally we would |
444 | move ->readdir() to inode_operations and use a separate method for directory | 447 | move ->readdir() to inode_operations and use a separate method for directory |
@@ -502,23 +505,31 @@ prototypes: | |||
502 | void (*open)(struct vm_area_struct*); | 505 | void (*open)(struct vm_area_struct*); |
503 | void (*close)(struct vm_area_struct*); | 506 | void (*close)(struct vm_area_struct*); |
504 | int (*fault)(struct vm_area_struct*, struct vm_fault *); | 507 | int (*fault)(struct vm_area_struct*, struct vm_fault *); |
505 | int (*page_mkwrite)(struct vm_area_struct *, struct page *); | 508 | int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); |
506 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); | 509 | int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); |
507 | 510 | ||
508 | locking rules: | 511 | locking rules: |
509 | BKL mmap_sem PageLocked(page) | 512 | BKL mmap_sem PageLocked(page) |
510 | open: no yes | 513 | open: no yes |
511 | close: no yes | 514 | close: no yes |
512 | fault: no yes | 515 | fault: no yes can return with page locked |
513 | page_mkwrite: no yes no | 516 | page_mkwrite: no yes can return with page locked |
514 | access: no yes | 517 | access: no yes |
515 | 518 | ||
516 | ->page_mkwrite() is called when a previously read-only page is | 519 | ->fault() is called when a previously not present pte is about |
517 | about to become writeable. The file system is responsible for | 520 | to be faulted in. The filesystem must find and return the page associated |
518 | protecting against truncate races. Once appropriate action has been | 521 | with the passed in "pgoff" in the vm_fault structure. If it is possible that |
519 | taking to lock out truncate, the page range should be verified to be | 522 | the page may be truncated and/or invalidated, then the filesystem must lock |
520 | within i_size. The page mapping should also be checked that it is not | 523 | the page, then ensure it is not already truncated (the page lock will block |
521 | NULL. | 524 | subsequent truncate), and then return with VM_FAULT_LOCKED, and the page |
525 | locked. The VM will unlock the page. | ||
526 | |||
527 | ->page_mkwrite() is called when a previously read-only pte is | ||
528 | about to become writeable. The filesystem again must ensure that there are | ||
529 | no truncate/invalidate races, and then return with the page locked. If | ||
530 | the page has been truncated, the filesystem should not look up a new page | ||
531 | like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which | ||
532 | will cause the VM to retry the fault. | ||
522 | 533 | ||
523 | ->access() is called when get_user_pages() fails in | 534 | ->access() is called when get_user_pages() fails in |
524 | access_process_vm(), typically used to debug a process through | 535 | access_process_vm(), typically used to debug a process through |
diff --git a/Documentation/filesystems/caching/backend-api.txt b/Documentation/filesystems/caching/backend-api.txt new file mode 100644 index 000000000000..382d52cdaf2d --- /dev/null +++ b/Documentation/filesystems/caching/backend-api.txt | |||
@@ -0,0 +1,658 @@ | |||
1 | ========================== | ||
2 | FS-CACHE CACHE BACKEND API | ||
3 | ========================== | ||
4 | |||
5 | The FS-Cache system provides an API by which actual caches can be supplied to | ||
6 | FS-Cache for it to then serve out to network filesystems and other interested | ||
7 | parties. | ||
8 | |||
9 | This API is declared in <linux/fscache-cache.h>. | ||
10 | |||
11 | |||
12 | ==================================== | ||
13 | INITIALISING AND REGISTERING A CACHE | ||
14 | ==================================== | ||
15 | |||
16 | To start off, a cache definition must be initialised and registered for each | ||
17 | cache the backend wants to make available. For instance, CacheFS does this in | ||
18 | the fill_super() operation on mounting. | ||
19 | |||
20 | The cache definition (struct fscache_cache) should be initialised by calling: | ||
21 | |||
22 | void fscache_init_cache(struct fscache_cache *cache, | ||
23 | struct fscache_cache_ops *ops, | ||
24 | const char *idfmt, | ||
25 | ...); | ||
26 | |||
27 | Where: | ||
28 | |||
29 | (*) "cache" is a pointer to the cache definition; | ||
30 | |||
31 | (*) "ops" is a pointer to the table of operations that the backend supports on | ||
32 | this cache; and | ||
33 | |||
34 | (*) "idfmt" is a format and printf-style arguments for constructing a label | ||
35 | for the cache. | ||
36 | |||
37 | |||
38 | The cache should then be registered with FS-Cache by passing a pointer to the | ||
39 | previously initialised cache definition to: | ||
40 | |||
41 | int fscache_add_cache(struct fscache_cache *cache, | ||
42 | struct fscache_object *fsdef, | ||
43 | const char *tagname); | ||
44 | |||
45 | Two extra arguments should also be supplied: | ||
46 | |||
47 | (*) "fsdef" which should point to the object representation for the FS-Cache | ||
48 | master index in this cache. Netfs primary index entries will be created | ||
49 | here. FS-Cache keeps the caller's reference to the index object if | ||
50 | successful and will release it upon withdrawal of the cache. | ||
51 | |||
52 | (*) "tagname" which, if given, should be a text string naming this cache. If | ||
53 | this is NULL, the identifier will be used instead. For CacheFS, the | ||
54 | identifier is set to name the underlying block device and the tag can be | ||
55 | supplied by mount. | ||
56 | |||
57 | This function may return -ENOMEM if it ran out of memory or -EEXIST if the tag | ||
58 | is already in use. 0 will be returned on success. | ||
59 | |||
60 | |||
61 | ===================== | ||
62 | UNREGISTERING A CACHE | ||
63 | ===================== | ||
64 | |||
65 | A cache can be withdrawn from the system by calling this function with a | ||
66 | pointer to the cache definition: | ||
67 | |||
68 | void fscache_withdraw_cache(struct fscache_cache *cache); | ||
69 | |||
70 | In CacheFS's case, this is called by put_super(). | ||
71 | |||
72 | |||
73 | ======== | ||
74 | SECURITY | ||
75 | ======== | ||
76 | |||
77 | The cache methods are executed in one of two contexts: | ||
78 | |||
79 | (1) that of the userspace process that issued the netfs operation that caused | ||
80 | the cache method to be invoked, or | ||
81 | |||
82 | (2) that of one of the processes in the FS-Cache thread pool. | ||
83 | |||
84 | In either case, this may not be an appropriate context in which to access the | ||
85 | cache. | ||
86 | |||
87 | The calling process's fsuid, fsgid and SELinux security identities may need to | ||
88 | be masqueraded for the duration of the cache driver's access to the cache. | ||
89 | This is left to the cache to handle; FS-Cache makes no effort in this regard. | ||
90 | |||
91 | |||
92 | =================================== | ||
93 | CONTROL AND STATISTICS PRESENTATION | ||
94 | =================================== | ||
95 | |||
96 | The cache may present data to the outside world through FS-Cache's interfaces | ||
97 | in sysfs and procfs - the former for control and the latter for statistics. | ||
98 | |||
99 | A sysfs directory called /sys/fs/fscache/<cachetag>/ is created if CONFIG_SYSFS | ||
100 | is enabled. This is accessible through the kobject struct fscache_cache::kobj | ||
101 | and is for use by the cache as it sees fit. | ||
102 | |||
103 | |||
104 | ======================== | ||
105 | RELEVANT DATA STRUCTURES | ||
106 | ======================== | ||
107 | |||
108 | (*) Index/Data file FS-Cache representation cookie: | ||
109 | |||
110 | struct fscache_cookie { | ||
111 | struct fscache_object_def *def; | ||
112 | struct fscache_netfs *netfs; | ||
113 | void *netfs_data; | ||
114 | ... | ||
115 | }; | ||
116 | |||
117 | The fields that might be of use to the backend describe the object | ||
118 | definition, the netfs definition and the netfs's data for this cookie. | ||
119 | The object definition contains functions supplied by the netfs for loading | ||
120 | and matching index entries; these are required to provide some of the | ||
121 | cache operations. | ||
122 | |||
123 | |||
124 | (*) In-cache object representation: | ||
125 | |||
126 | struct fscache_object { | ||
127 | int debug_id; | ||
128 | enum { | ||
129 | FSCACHE_OBJECT_RECYCLING, | ||
130 | ... | ||
131 | } state; | ||
132 | spinlock_t lock; | ||
133 | struct fscache_cache *cache; | ||
134 | struct fscache_cookie *cookie; | ||
135 | ... | ||
136 | }; | ||
137 | |||
138 | Structures of this type should be allocated by the cache backend and | ||
139 | passed to FS-Cache when requested by the appropriate cache operation. In | ||
140 | the case of CacheFS, they're embedded in CacheFS's internal object | ||
141 | structures. | ||
142 | |||
143 | The debug_id is a simple integer that can be used in debugging messages | ||
144 | that refer to a particular object. In such a case it should be printed | ||
145 | using "OBJ%x" to be consistent with FS-Cache. | ||
146 | |||
147 | Each object contains a pointer to the cookie that represents the object it | ||
148 | is backing. An object should be retired when put_object() is called if it is | ||
149 | in state FSCACHE_OBJECT_RECYCLING. The fscache_object struct should be | ||
150 | initialised by calling fscache_object_init(object). | ||
151 | |||
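For illustration (an editor's sketch; the my_object type is hypothetical,
though CacheFiles embeds the struct in its own object types in much the same
way):

	struct my_object {
		struct fscache_object	fscache;  /* init with fscache_object_init() */
		/* ... backend-private state: backing inode, parsed keys, etc ... */
	};

	static inline struct my_object *to_my_object(struct fscache_object *obj)
	{
		return container_of(obj, struct my_object, fscache);
	}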
152 | |||
153 | (*) FS-Cache operation record: | ||
154 | |||
155 | struct fscache_operation { | ||
156 | atomic_t usage; | ||
157 | struct fscache_object *object; | ||
158 | unsigned long flags; | ||
159 | #define FSCACHE_OP_EXCLUSIVE | ||
160 | void (*processor)(struct fscache_operation *op); | ||
161 | void (*release)(struct fscache_operation *op); | ||
162 | ... | ||
163 | }; | ||
164 | |||
165 | FS-Cache has a pool of threads that it uses to give CPU time to the | ||
166 | various asynchronous operations that need to be done as part of driving | ||
167 | the cache. These are represented by the above structure. The processor | ||
168 | method is called to give the op CPU time, and the release method to get | ||
169 | rid of it when its usage count reaches 0. | ||
170 | |||
171 | An operation can be made exclusive upon an object by setting the | ||
172 | appropriate flag before enqueuing it with fscache_enqueue_operation(). If | ||
173 | an operation needs more processing time, it should be enqueued again. | ||
174 | |||
175 | |||
176 | (*) FS-Cache retrieval operation record: | ||
177 | |||
178 | struct fscache_retrieval { | ||
179 | struct fscache_operation op; | ||
180 | struct address_space *mapping; | ||
181 | struct list_head *to_do; | ||
182 | ... | ||
183 | }; | ||
184 | |||
185 | A structure of this type is allocated by FS-Cache to record retrieval and | ||
186 | allocation requests made by the netfs. This struct is then passed to the | ||
187 | backend to do the operation. The backend may get extra refs to it by | ||
188 | calling fscache_get_retrieval() and refs may be discarded by calling | ||
189 | fscache_put_retrieval(). | ||
190 | |||
191 | A retrieval operation can be used by the backend to do retrieval work. To | ||
192 | do this, the retrieval->op.processor method pointer should be set | ||
193 | appropriately by the backend and fscache_enqueue_retrieval() called to | ||
194 | submit it to the thread pool. CacheFiles, for example, uses this to queue | ||
195 | page examination when it detects PG_locked being cleared. | ||
196 | |||
197 | The to_do field is an empty list available for the cache backend to use as | ||
198 | it sees fit. | ||
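
A sketch of that usage pattern (editor's illustration; the my_* names are
hypothetical):

	static void my_retrieval_processor(struct fscache_operation *_op)
	{
		struct fscache_retrieval *op =
			container_of(_op, struct fscache_retrieval, op);

		/* examine the pages queued on op->to_do and call
		 * fscache_end_io() on those whose backing I/O has finished */
	}

	/* called from the backend's I/O completion path */
	static void my_read_complete(struct fscache_retrieval *op)
	{
		op->op.processor = my_retrieval_processor;
		fscache_enqueue_retrieval(op);
	}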
199 | |||
200 | |||
201 | (*) FS-Cache storage operation record: | ||
202 | |||
203 | struct fscache_storage { | ||
204 | struct fscache_operation op; | ||
205 | pgoff_t store_limit; | ||
206 | ... | ||
207 | }; | ||
208 | |||
209 | A structure of this type is allocated by FS-Cache to record outstanding | ||
210 | writes to be made. FS-Cache itself enqueues this operation and invokes | ||
211 | the write_page() method on the object at appropriate times to effect | ||
212 | storage. | ||
213 | |||
214 | |||
215 | ================ | ||
216 | CACHE OPERATIONS | ||
217 | ================ | ||
218 | |||
219 | The cache backend provides FS-Cache with a table of operations that can be | ||
220 | performed on the denizens of the cache. These are held in a structure of type: | ||
221 | |||
222 | struct fscache_cache_ops | ||
223 | |||
224 | (*) Name of cache provider [mandatory]: | ||
225 | |||
226 | const char *name | ||
227 | |||
228 | This isn't strictly an operation, but should be pointed at a string naming | ||
229 | the backend. | ||
230 | |||
231 | |||
232 | (*) Allocate a new object [mandatory]: | ||
233 | |||
234 | struct fscache_object *(*alloc_object)(struct fscache_cache *cache, | ||
235 | struct fscache_cookie *cookie) | ||
236 | |||
237 | This method is used to allocate a cache object representation to back a | ||
238 | cookie in a particular cache. fscache_object_init() should be called on | ||
239 | the object to initialise it prior to returning. | ||
240 | |||
241 | This function may also be used to parse the index key to be used for | ||
242 | multiple lookup calls to turn it into a more convenient form. FS-Cache | ||
243 | will call the lookup_complete() method to allow the cache to release the | ||
244 | form once lookup is complete or aborted. | ||
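
A minimal sketch of such a method (editor's illustration, reusing the
hypothetical my_object type from the section above):

	static struct fscache_object *my_alloc_object(struct fscache_cache *cache,
						      struct fscache_cookie *cookie)
	{
		struct my_object *obj;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj)
			return NULL;

		fscache_object_init(&obj->fscache);
		/* a parsed form of the cookie's index key could be stashed in
		 * obj here and released again in lookup_complete() */
		return &obj->fscache;
	}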
245 | |||
246 | |||
247 | (*) Look up and create object [mandatory]: | ||
248 | |||
249 | void (*lookup_object)(struct fscache_object *object) | ||
250 | |||
251 | This method is used to look up an object, given that the object is already | ||
252 | allocated and attached to the cookie. This should instantiate that object | ||
253 | in the cache if it can. | ||
254 | |||
255 | The method should call fscache_object_lookup_negative() as soon as | ||
256 | possible if it determines the object doesn't exist in the cache. If the | ||
257 | object is found to exist and the netfs indicates that it is valid then | ||
258 | fscache_obtained_object() should be called once the object is in a | ||
259 | position to have data stored in it. Similarly, fscache_obtained_object() | ||
260 | should also be called once a non-present object has been created. | ||
261 | |||
262 | If a lookup error occurs, fscache_object_lookup_error() should be called | ||
263 | to abort the lookup of that object. | ||
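
Schematically (editor's sketch; the my_* helpers and MY_LOOKUP_* results are
hypothetical), the expected call pattern is something like:

	static void my_lookup_object(struct fscache_object *object)
	{
		struct my_object *obj = to_my_object(object);

		switch (my_look_up_in_backing_store(obj)) {
		case MY_LOOKUP_ABSENT:
			fscache_object_lookup_negative(object);
			if (my_create_in_backing_store(obj) < 0) {
				fscache_object_lookup_error(object);
				return;
			}
			fscache_obtained_object(object);	/* freshly created */
			break;

		case MY_LOOKUP_FOUND_AND_VALID:
			fscache_obtained_object(object);	/* ready for I/O */
			break;

		default:
			fscache_object_lookup_error(object);	/* fatal, e.g. -EIO */
			break;
		}
	}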
264 | |||
265 | |||
266 | (*) Release lookup data [mandatory]: | ||
267 | |||
268 | void (*lookup_complete)(struct fscache_object *object) | ||
269 | |||
270 | This method is called to ask the cache to release any resources it was | ||
271 | using to perform a lookup. | ||
272 | |||
273 | |||
274 | (*) Increment object refcount [mandatory]: | ||
275 | |||
276 | struct fscache_object *(*grab_object)(struct fscache_object *object) | ||
277 | |||
278 | This method is called to increment the reference count on an object. It | ||
279 | may fail (for instance if the cache is being withdrawn) by returning NULL. | ||
280 | It should return the object pointer if successful. | ||
281 | |||
282 | |||
283 | (*) Lock/Unlock object [mandatory]: | ||
284 | |||
285 | void (*lock_object)(struct fscache_object *object) | ||
286 | void (*unlock_object)(struct fscache_object *object) | ||
287 | |||
288 | These methods are used to exclusively lock an object. It must be possible | ||
289 | to schedule with the lock held, so a spinlock isn't sufficient. | ||
290 | |||
291 | |||
292 | (*) Pin/Unpin object [optional]: | ||
293 | |||
294 | int (*pin_object)(struct fscache_object *object) | ||
295 | void (*unpin_object)(struct fscache_object *object) | ||
296 | |||
297 | These methods are used to pin an object into the cache. Once pinned an | ||
298 | object cannot be reclaimed to make space. Return -ENOSPC if there's not | ||
299 | enough space in the cache to permit this. | ||
300 | |||
301 | |||
302 | (*) Update object [mandatory]: | ||
303 | |||
304 | int (*update_object)(struct fscache_object *object) | ||
305 | |||
306 | This is called to update the index entry for the specified object. The | ||
307 | new information should be in object->cookie->netfs_data. This can be | ||
308 | obtained by calling object->cookie->def->get_aux()/get_attr(). | ||
309 | |||
310 | |||
311 | (*) Discard object [mandatory]: | ||
312 | |||
313 | void (*drop_object)(struct fscache_object *object) | ||
314 | |||
315 | This method is called to indicate that an object has been unbound from its | ||
316 | cookie, and that the cache should release the object's resources and | ||
317 | retire it if it's in state FSCACHE_OBJECT_RECYCLING. | ||
318 | |||
319 | This method should not attempt to release any references held by the | ||
320 | caller. The caller will invoke the put_object() method as appropriate. | ||
321 | |||
322 | |||
323 | (*) Release object reference [mandatory]: | ||
324 | |||
325 | void (*put_object)(struct fscache_object *object) | ||
326 | |||
327 | This method is used to discard a reference to an object. The object may | ||
328 | be freed when all the references to it are released. | ||
329 | |||
330 | |||
331 | (*) Synchronise a cache [mandatory]: | ||
332 | |||
333 | void (*sync)(struct fscache_cache *cache) | ||
334 | |||
335 | This is called to ask the backend to synchronise a cache with its backing | ||
336 | device. | ||
337 | |||
338 | |||
339 | (*) Dissociate a cache [mandatory]: | ||
340 | |||
341 | void (*dissociate_pages)(struct fscache_cache *cache) | ||
342 | |||
343 | This is called to ask a cache to perform any page dissociations as part of | ||
344 | cache withdrawal. | ||
345 | |||
346 | |||
347 | (*) Notification that the attributes on a netfs file changed [mandatory]: | ||
348 | |||
349 | int (*attr_changed)(struct fscache_object *object); | ||
350 | |||
351 | This is called to indicate to the cache that certain attributes on a netfs | ||
352 | file have changed (for example the maximum size a file may reach). The | ||
353 | cache can read these from the netfs by calling the cookie's get_attr() | ||
354 | method. | ||
355 | |||
356 | The cache may use the file size information to reserve space on the cache. | ||
357 | It should also call fscache_set_store_limit() to indicate to FS-Cache the | ||
358 | highest byte it's willing to store for an object. | ||
359 | |||
360 | This method may return -ve if an error occurred or the cache object cannot | ||
361 | be expanded. In such a case, the object will be withdrawn from service. | ||
362 | |||
363 | This operation is run asynchronously from FS-Cache's thread pool, and | ||
364 | storage and retrieval operations from the netfs are excluded during the | ||
365 | execution of this operation. | ||
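
As a sketch (editor's illustration; the my_* helper is hypothetical, and the
get_attr() call assumes that method takes the netfs's private data pointer and
returns the size through a pointer argument):

	static int my_attr_changed(struct fscache_object *object)
	{
		uint64_t i_size;

		object->cookie->def->get_attr(object->cookie->netfs_data, &i_size);

		if (my_resize_backing_store(object, i_size) < 0)
			return -ENOBUFS;	/* the object will be withdrawn */

		fscache_set_store_limit(object, i_size);
		return 0;
	}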
366 | |||
367 | |||
368 | (*) Reserve cache space for an object's data [optional]: | ||
369 | |||
370 | int (*reserve_space)(struct fscache_object *object, loff_t size); | ||
371 | |||
372 | This is called to request that cache space be reserved to hold the data | ||
373 | for an object and the metadata used to track it. Zero size should be | ||
374 | taken as a request to cancel a reservation. | ||
375 | |||
376 | This should return 0 if successful, -ENOSPC if there isn't enough space | ||
377 | available, or -ENOMEM or -EIO on other errors. | ||
378 | |||
379 | The reservation may exceed the current size of the object, thus permitting | ||
380 | future expansion. If the amount of space consumed by an object would | ||
381 | exceed the reservation, it's permitted to refuse requests to allocate | ||
382 | pages, but not required. An object may be pruned down to its reservation | ||
383 | size if larger than that already. | ||
384 | |||
385 | |||
386 | (*) Request page be read from cache [mandatory]: | ||
387 | |||
388 | int (*read_or_alloc_page)(struct fscache_retrieval *op, | ||
389 | struct page *page, | ||
390 | gfp_t gfp) | ||
391 | |||
392 | This is called to attempt to read a netfs page from the cache, or to | ||
393 | reserve a backing block if not. FS-Cache will have done as much checking | ||
394 | as it can before calling, but most of the work belongs to the backend. | ||
395 | |||
396 | If there's no page in the cache, then -ENODATA should be returned if the | ||
397 | backend managed to reserve a backing block; -ENOBUFS or -ENOMEM if it | ||
398 | didn't. | ||
399 | |||
400 | If there is suitable data in the cache, then a read operation should be | ||
401 | queued and 0 returned. When the read finishes, fscache_end_io() should be | ||
402 | called. | ||
403 | |||
404 | fscache_mark_pages_cached() should be called for the page if any cache | ||
405 | metadata is retained. This will indicate to the netfs that the page needs | ||
406 | explicit uncaching. This operation takes a pagevec, thus allowing several | ||
407 | pages to be marked at once. | ||
408 | |||
409 | The retrieval record pointed to by op should be retained for each page | ||
410 | queued and released when I/O on the page has been formally ended. | ||
411 | fscache_get/put_retrieval() are available for this purpose. | ||
412 | |||
413 | The retrieval record may be used to get CPU time via the FS-Cache thread | ||
414 | pool. If this is desired, the op->op.processor should be set to point to | ||
415 | the appropriate processing routine, and fscache_enqueue_retrieval() should | ||
416 | be called at an appropriate point to request CPU time. For instance, the | ||
417 | retrieval routine could be enqueued upon the completion of a disk read. | ||
418 | The to_do field in the retrieval record is provided to aid in this. | ||
419 | |||
420 | If an I/O error occurs, fscache_io_error() should be called and -ENOBUFS | ||
421 | returned if possible or fscache_end_io() called with a suitable error | ||
422 | code.. | ||
423 | |||
424 | |||
425 | (*) Request pages be read from cache [mandatory]: | ||
426 | |||
427 | int (*read_or_alloc_pages)(struct fscache_retrieval *op, | ||
428 | struct list_head *pages, | ||
429 | unsigned *nr_pages, | ||
430 | gfp_t gfp) | ||
431 | |||
432 | This is like the read_or_alloc_page() method, except it is handed a list | ||
433 | of pages instead of one page. Any pages on which a read operation is | ||
434 | started must be added to the page cache for the specified mapping and also | ||
435 | to the LRU. Such pages must also be removed from the pages list and | ||
436 | *nr_pages decremented per page. | ||
437 | |||
438 | If there was an error such as -ENOMEM, then that should be returned; else | ||
439 | if one or more pages couldn't be read or allocated, then -ENOBUFS should | ||
440 | be returned; else if one or more pages couldn't be read, then -ENODATA | ||
441 | should be returned. If all the pages are dispatched then 0 should be | ||
442 | returned. | ||
443 | |||
444 | |||
445 | (*) Request page be allocated in the cache [mandatory]: | ||
446 | |||
447 | int (*allocate_page)(struct fscache_retrieval *op, | ||
448 | struct page *page, | ||
449 | gfp_t gfp) | ||
450 | |||
451 | This is like the read_or_alloc_page() method, except that it shouldn't | ||
452 | read from the cache, even if there's data there that could be retrieved. | ||
453 | It should, however, set up any internal metadata required such that | ||
454 | the write_page() method can write to the cache. | ||
455 | |||
456 | If there's no backing block available, then -ENOBUFS should be returned | ||
457 | (or -ENOMEM if there were other problems). If a block is successfully | ||
458 | allocated, then the netfs page should be marked and 0 returned. | ||
459 | |||
460 | |||
461 | (*) Request pages be allocated in the cache [mandatory]: | ||
462 | |||
463 | int (*allocate_pages)(struct fscache_retrieval *op, | ||
464 | struct list_head *pages, | ||
465 | unsigned *nr_pages, | ||
466 | gfp_t gfp) | ||
467 | |||
468 | This is a multiple page version of the allocate_page() method. pages and | ||
469 | nr_pages should be treated as for the read_or_alloc_pages() method. | ||
470 | |||
471 | |||
472 | (*) Request page be written to cache [mandatory]: | ||
473 | |||
474 | int (*write_page)(struct fscache_storage *op, | ||
475 | struct page *page); | ||
476 | |||
477 | This is called to write from a page on which there was a previously | ||
478 | successful read_or_alloc_page() call or similar. FS-Cache filters out | ||
479 | pages that don't have mappings. | ||
480 | |||
481 | This method is called asynchronously from the FS-Cache thread pool. It is | ||
482 | not required to actually store anything, provided -ENODATA is then | ||
483 | returned to the next read of this page. | ||
484 | |||
485 | If an error occurred, then a negative error code should be returned, | ||
486 | otherwise zero should be returned. FS-Cache will take appropriate action | ||
487 | in response to an error, such as withdrawing this object. | ||
488 | |||
489 | If this method returns success then FS-Cache will inform the netfs | ||
490 | appropriately. | ||
491 | |||
492 | |||
493 | (*) Discard retained per-page metadata [mandatory]: | ||
494 | |||
495 | void (*uncache_page)(struct fscache_object *object, struct page *page) | ||
496 | |||
497 | This is called when a netfs page is being evicted from the pagecache. The | ||
498 | cache backend should tear down any internal representation or tracking it | ||
499 | maintains for this page. | ||
500 | |||
501 | |||
502 | ================== | ||
503 | FS-CACHE UTILITIES | ||
504 | ================== | ||
505 | |||
506 | FS-Cache provides some utilities that a cache backend may make use of: | ||
507 | |||
508 | (*) Note occurrence of an I/O error in a cache: | ||
509 | |||
510 | void fscache_io_error(struct fscache_cache *cache) | ||
511 | |||
512 | This tells FS-Cache that an I/O error occurred in the cache. After this | ||
513 | has been called, only resource dissociation operations (object and page | ||
514 | release) will be passed from the netfs to the cache backend for the | ||
515 | specified cache. | ||
516 | |||
517 | This does not actually withdraw the cache. That must be done separately. | ||
518 | |||
519 | |||
520 | (*) Invoke the retrieval I/O completion function: | ||
521 | |||
522 | void fscache_end_io(struct fscache_retrieval *op, struct page *page, | ||
523 | int error); | ||
524 | |||
525 | This is called to note the end of an attempt to retrieve a page. The | ||
526 | error value should be 0 if successful and an error otherwise. | ||
527 | |||
528 | |||
529 | (*) Set highest store limit: | ||
530 | |||
531 | void fscache_set_store_limit(struct fscache_object *object, | ||
532 | loff_t i_size); | ||
533 | |||
534 | This sets the limit FS-Cache imposes on the highest byte it's willing to | ||
535 | try and store for a netfs. Any page over this limit is automatically | ||
536 | rejected by fscache_read_alloc_page() and co with -ENOBUFS. | ||
537 | |||
538 | |||
539 | (*) Mark pages as being cached: | ||
540 | |||
541 | void fscache_mark_pages_cached(struct fscache_retrieval *op, | ||
542 | struct pagevec *pagevec); | ||
543 | |||
544 | This marks a set of pages as being cached. After this has been called, | ||
545 | the netfs must call fscache_uncache_page() to unmark the pages. | ||
546 | |||
547 | |||
548 | (*) Perform coherency check on an object: | ||
549 | |||
550 | enum fscache_checkaux fscache_check_aux(struct fscache_object *object, | ||
551 | const void *data, | ||
552 | uint16_t datalen); | ||
553 | |||
554 | This asks the netfs to perform a coherency check on an object that has | ||
555 | just been looked up. The cookie attached to the object will determine the | ||
556 | netfs to use. data and datalen should specify where the auxiliary data | ||
557 | retrieved from the cache can be found; a usage sketch follows the list below. | ||
558 | |||
559 | One of three values will be returned: | ||
560 | |||
561 | (*) FSCACHE_CHECKAUX_OKAY | ||
562 | |||
563 | The coherency data indicates the object is valid as is. | ||
564 | |||
565 | (*) FSCACHE_CHECKAUX_NEEDS_UPDATE | ||
566 | |||
567 | The coherency data needs updating, but otherwise the object is | ||
568 | valid. | ||
569 | |||
570 | (*) FSCACHE_CHECKAUX_OBSOLETE | ||
571 | |||
572 | The coherency data indicates that the object is obsolete and should | ||
573 | be discarded. | ||
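
A sketch of typical usage during lookup (editor's illustration; the my_*
xattr helpers are hypothetical):

	static int my_validate_object(struct fscache_object *object)
	{
		u8 aux[64];
		int len;

		len = my_read_aux_xattr(object, aux, sizeof(aux));
		if (len < 0)
			return len;

		switch (fscache_check_aux(object, aux, len)) {
		case FSCACHE_CHECKAUX_OKAY:
			return 0;			/* use the object as is */
		case FSCACHE_CHECKAUX_NEEDS_UPDATE:
			return my_write_aux_xattr(object); /* refresh the aux data */
		case FSCACHE_CHECKAUX_OBSOLETE:
		default:
			return -ESTALE;			/* the object should be retired */
		}
	}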
574 | |||
575 | |||
576 | (*) Initialise a freshly allocated object: | ||
577 | |||
578 | void fscache_object_init(struct fscache_object *object); | ||
579 | |||
580 | This initialises all the fields in an object representation. | ||
581 | |||
582 | |||
583 | (*) Indicate the destruction of an object: | ||
584 | |||
585 | void fscache_object_destroyed(struct fscache_cache *cache); | ||
586 | |||
587 | This must be called to inform FS-Cache that an object that belonged to a | ||
588 | cache has been destroyed and deallocated. This will allow continuation | ||
589 | of the cache withdrawal process when it is stopped pending destruction of | ||
590 | all the objects. | ||
591 | |||
592 | |||
593 | (*) Indicate negative lookup on an object: | ||
594 | |||
595 | void fscache_object_lookup_negative(struct fscache_object *object); | ||
596 | |||
597 | This is called to indicate to FS-Cache that a lookup process for an object | ||
598 | found a negative result. | ||
599 | |||
600 | This changes the state of an object to permit reads pending on lookup | ||
601 | completion to go off and start fetching data from the netfs server as it's | ||
602 | known at this point that there can't be any data in the cache. | ||
603 | |||
604 | This may be called multiple times on an object. Only the first call is | ||
605 | significant - all subsequent calls are ignored. | ||
606 | |||
607 | |||
608 | (*) Indicate an object has been obtained: | ||
609 | |||
610 | void fscache_obtained_object(struct fscache_object *object); | ||
611 | |||
612 | This is called to indicate to FS-Cache that a lookup process for an object | ||
613 | produced a positive result, or that an object was created. This should | ||
614 | only be called once for any particular object. | ||
615 | |||
616 | This changes the state of an object to indicate: | ||
617 | |||
618 | (1) if no call to fscache_object_lookup_negative() has been made on | ||
619 | this object, that there may be data available, and that reads can | ||
620 | now go and look for it; and | ||
621 | |||
622 | (2) that writes may now proceed against this object. | ||
623 | |||
624 | |||
625 | (*) Indicate that object lookup failed: | ||
626 | |||
627 | void fscache_object_lookup_error(struct fscache_object *object); | ||
628 | |||
629 | This marks an object as having encountered a fatal error (usually EIO) | ||
630 | and causes it to move into a state whereby it will be withdrawn as soon | ||
631 | as possible. | ||
632 | |||
633 | |||
634 | (*) Get and release references on a retrieval record: | ||
635 | |||
636 | void fscache_get_retrieval(struct fscache_retrieval *op); | ||
637 | void fscache_put_retrieval(struct fscache_retrieval *op); | ||
638 | |||
639 | These two functions are used to retain a retrieval record whilst doing | ||
640 | asynchronous data retrieval and block allocation. | ||
641 | |||
642 | |||
643 | (*) Enqueue a retrieval record for processing. | ||
644 | |||
645 | void fscache_enqueue_retrieval(struct fscache_retrieval *op); | ||
646 | |||
647 | This enqueues a retrieval record for processing by the FS-Cache thread | ||
648 | pool. One of the threads in the pool will invoke the retrieval record's | ||
649 | op->op.processor callback function. This function may be called from | ||
650 | within the callback function. | ||
651 | |||
652 | |||
653 | (*) List of object state names: | ||
654 | |||
655 | const char *fscache_object_states[]; | ||
656 | |||
657 | For debugging purposes, this may be used to turn the state that an object | ||
658 | is in into a text string for display purposes. | ||
diff --git a/Documentation/filesystems/caching/cachefiles.txt b/Documentation/filesystems/caching/cachefiles.txt new file mode 100644 index 000000000000..748a1ae49e12 --- /dev/null +++ b/Documentation/filesystems/caching/cachefiles.txt | |||
@@ -0,0 +1,501 @@ | |||
1 | =============================================== | ||
2 | CacheFiles: CACHE ON ALREADY MOUNTED FILESYSTEM | ||
3 | =============================================== | ||
4 | |||
5 | Contents: | ||
6 | |||
7 | (*) Overview. | ||
8 | |||
9 | (*) Requirements. | ||
10 | |||
11 | (*) Configuration. | ||
12 | |||
13 | (*) Starting the cache. | ||
14 | |||
15 | (*) Things to avoid. | ||
16 | |||
17 | (*) Cache culling. | ||
18 | |||
19 | (*) Cache structure. | ||
20 | |||
21 | (*) Security model and SELinux. | ||
22 | |||
23 | (*) A note on security. | ||
24 | |||
25 | (*) Statistical information. | ||
26 | |||
27 | (*) Debugging. | ||
28 | |||
29 | |||
30 | ======== | ||
31 | OVERVIEW | ||
32 | ======== | ||
33 | |||
34 | CacheFiles is a caching backend that uses a directory on an already mounted | ||
35 | filesystem of a local type (such as Ext3) as its cache. | ||
36 | |||
37 | CacheFiles uses a userspace daemon to do some of the cache management - such as | ||
38 | reaping stale nodes and culling. This is called cachefilesd and lives in | ||
39 | /sbin. | ||
40 | |||
41 | The filesystem and data integrity of the cache are only as good as those of the | ||
42 | filesystem providing the backing services. Note that CacheFiles does not | ||
43 | attempt to journal anything since the journalling interfaces of the various | ||
44 | filesystems are very specific in nature. | ||
45 | |||
46 | CacheFiles creates a misc character device - "/dev/cachefiles" - that is used | ||
47 | to communicate with the daemon. Only one thing may have this open at once, | ||
48 | and whilst it is open, a cache is at least partially in existence. The daemon | ||
49 | opens this and sends commands down it to control the cache. | ||
50 | |||
51 | CacheFiles is currently limited to a single cache. | ||
52 | |||
53 | CacheFiles attempts to maintain at least a certain percentage of free space on | ||
54 | the filesystem, shrinking the cache by culling the objects it contains to make | ||
55 | space if necessary - see the "Cache Culling" section. This means it can be | ||
56 | placed on the same medium as a live set of data, and will expand to make use of | ||
57 | spare space and automatically contract when the set of data requires more | ||
58 | space. | ||
59 | |||
60 | |||
61 | ============ | ||
62 | REQUIREMENTS | ||
63 | ============ | ||
64 | |||
65 | The use of CacheFiles and its daemon requires the following features to be | ||
66 | available in the system and in the cache filesystem: | ||
67 | |||
68 | - dnotify. | ||
69 | |||
70 | - extended attributes (xattrs). | ||
71 | |||
72 | - openat() and friends. | ||
73 | |||
74 | - bmap() support on files in the filesystem (FIBMAP ioctl). | ||
75 | |||
76 | - The use of bmap() to detect a partial page at the end of the file. | ||
77 | |||
78 | It is strongly recommended that the "dir_index" option is enabled on Ext3 | ||
79 | filesystems being used as a cache. | ||
80 | |||
81 | |||
82 | ============= | ||
83 | CONFIGURATION | ||
84 | ============= | ||
85 | |||
86 | The cache is configured by a script in /etc/cachefilesd.conf; a sample file is | ||
87 | shown after this list. The following script commands are available: | ||
88 | |||
89 | (*) brun <N>% | ||
90 | (*) bcull <N>% | ||
91 | (*) bstop <N>% | ||
92 | (*) frun <N>% | ||
93 | (*) fcull <N>% | ||
94 | (*) fstop <N>% | ||
95 | |||
96 | Configure the culling limits. Optional. See the section on culling. | ||
97 | The defaults are 7% (run), 5% (cull) and 1% (stop) respectively. | ||
98 | |||
99 | The commands beginning with a 'b' are file space (block) limits, those | ||
100 | beginning with an 'f' are file count limits. | ||
101 | |||
102 | (*) dir <path> | ||
103 | |||
104 | Specify the directory containing the root of the cache. Mandatory. | ||
105 | |||
106 | (*) tag <name> | ||
107 | |||
108 | Specify a tag to FS-Cache to use in distinguishing multiple caches. | ||
109 | Optional. The default is "CacheFiles". | ||
110 | |||
111 | (*) debug <mask> | ||
112 | |||
113 | Specify a numeric bitmask to control debugging in the kernel module. | ||
114 | Optional. The default is zero (all off). The following values can be | ||
115 | OR'd into the mask to collect various information: | ||
116 | |||
117 | 1 Turn on trace of function entry (_enter() macros) | ||
118 | 2 Turn on trace of function exit (_leave() macros) | ||
119 | 4 Turn on trace of internal debug points (_debug()) | ||
120 | |||
121 | This mask can also be set through sysfs, eg: | ||
122 | |||
123 | echo 5 >/sys/module/cachefiles/parameters/debug | ||
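
As noted above, a sample /etc/cachefilesd.conf putting these commands together
might look like this (the values are purely illustrative and must satisfy the
ordering constraints given in the section on cache culling):

	dir /var/fscache
	tag mycache
	brun 10%
	bcull 7%
	bstop 3%
	frun 10%
	fcull 7%
	fstop 3%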
124 | |||
125 | |||
126 | ================== | ||
127 | STARTING THE CACHE | ||
128 | ================== | ||
129 | |||
130 | The cache is started by running the daemon. The daemon opens the cache device, | ||
131 | configures the cache and tells it to begin caching. At that point the cache | ||
132 | binds to fscache and the cache becomes live. | ||
133 | |||
134 | The daemon is run as follows: | ||
135 | |||
136 | /sbin/cachefilesd [-d]* [-s] [-n] [-f <configfile>] | ||
137 | |||
138 | The flags are: | ||
139 | |||
140 | (*) -d | ||
141 | |||
142 | Increase the debugging level. This can be specified multiple times and | ||
143 | is cumulative with itself. | ||
144 | |||
145 | (*) -s | ||
146 | |||
147 | Send messages to stderr instead of syslog. | ||
148 | |||
149 | (*) -n | ||
150 | |||
151 | Don't daemonise and go into background. | ||
152 | |||
153 | (*) -f <configfile> | ||
154 | |||
155 | Use an alternative configuration file rather than the default one. | ||
156 | |||
157 | |||
158 | =============== | ||
159 | THINGS TO AVOID | ||
160 | =============== | ||
161 | |||
162 | Do not mount other things within the cache as this will cause problems. The | ||
163 | kernel module contains its own very cut-down path walking facility that ignores | ||
164 | mountpoints, but the daemon can't avoid them. | ||
165 | |||
166 | Do not create, rename or unlink files and directories in the cache whilst the | ||
167 | cache is active, as this may cause the state to become uncertain. | ||
168 | |||
169 | Renaming files in the cache might make objects appear to be other objects (the | ||
170 | filename is part of the lookup key). | ||
171 | |||
172 | Do not change or remove the extended attributes attached to cache files by the | ||
173 | cache as this will cause the cache state management to get confused. | ||
174 | |||
175 | Do not create files or directories in the cache, lest the cache get confused or | ||
176 | serve incorrect data. | ||
177 | |||
178 | Do not chmod files in the cache. The module creates things with minimal | ||
179 | permissions to prevent random users being able to access them directly. | ||
180 | |||
181 | |||
182 | ============= | ||
183 | CACHE CULLING | ||
184 | ============= | ||
185 | |||
186 | The cache may need culling occasionally to make space. This involves | ||
187 | discarding objects from the cache that have been used less recently than | ||
188 | anything else. Culling is based on the access time of data objects. Empty | ||
189 | directories are culled if not in use. | ||
190 | |||
191 | Cache culling is done on the basis of the percentage of blocks and the | ||
192 | percentage of files available in the underlying filesystem. There are six | ||
193 | "limits": | ||
194 | |||
195 | (*) brun | ||
196 | (*) frun | ||
197 | |||
198 | If the amount of free space and the number of available files in the cache | ||
199 | rises above both these limits, then culling is turned off. | ||
200 | |||
201 | (*) bcull | ||
202 | (*) fcull | ||
203 | |||
204 | If the amount of available space or the number of available files in the | ||
205 | cache falls below either of these limits, then culling is started. | ||
206 | |||
207 | (*) bstop | ||
208 | (*) fstop | ||
209 | |||
210 | If the amount of available space or the number of available files in the | ||
211 | cache falls below either of these limits, then no further allocation of | ||
212 | disk space or files is permitted until culling has raised things above | ||
213 | these limits again. | ||
214 | |||
215 | These must be configured thusly: | ||
216 | |||
217 | 0 <= bstop < bcull < brun < 100 | ||
218 | 0 <= fstop < fcull < frun < 100 | ||
219 | |||
220 | Note that these are percentages of available space and available files, and do | ||
221 | _not_ appear as 100 minus the percentage displayed by the "df" program. | ||
222 | |||
223 | The userspace daemon scans the cache to build up a table of cullable objects. | ||
224 | These are then culled in least recently used order. A new scan of the cache is | ||
225 | started as soon as space is made in the table. Objects will be skipped if | ||
226 | their atimes have changed or if the kernel module says it is still using them. | ||
227 | |||
228 | |||
229 | =============== | ||
230 | CACHE STRUCTURE | ||
231 | =============== | ||
232 | |||
233 | The CacheFiles module will create two directories in the directory it was | ||
234 | given: | ||
235 | |||
236 | (*) cache/ | ||
237 | |||
238 | (*) graveyard/ | ||
239 | |||
240 | The active cache objects all reside in the first directory. The CacheFiles | ||
241 | kernel module moves any retired or culled objects that it can't simply unlink | ||
242 | to the graveyard from which the daemon will actually delete them. | ||
243 | |||
244 | The daemon uses dnotify to monitor the graveyard directory, and will delete | ||
245 | anything that appears therein. | ||
246 | |||
247 | |||
248 | The module represents index objects as directories with the filename "I..." or | ||
249 | "J...". Note that the "cache/" directory is itself a special index. | ||
250 | |||
251 | Data objects are represented as files if they have no children, or directories | ||
252 | if they do. Their filenames all begin "D..." or "E...". If represented as a | ||
253 | directory, data objects will have a file in the directory called "data" that | ||
254 | actually holds the data. | ||
255 | |||
256 | Special objects are similar to data objects, except their filenames begin | ||
257 | "S..." or "T...". | ||
258 | |||
259 | |||
260 | If an object has children, then it will be represented as a directory. | ||
261 | Immediately in the representative directory are a collection of directories | ||
262 | named for hash values of the child object keys with an '@' prepended. Into | ||
263 | this directory, if possible, will be placed the representations of the child | ||
264 | objects: | ||
265 | |||
266 | INDEX INDEX INDEX DATA FILES | ||
267 | ========= ========== ================================= ================ | ||
268 | cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400 | ||
269 | cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...DB1ry | ||
270 | cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...N22ry | ||
271 | cache/@4a/I03nfs/@30/Ji000000000000000--fHg8hi8400/@75/Es0g000w...FP1ry | ||
272 | |||
273 | |||
274 | If the key is so long that it exceeds NAME_MAX with the decorations added on to | ||
275 | it, then it will be cut into pieces, the first few of which will be used to | ||
276 | make a nest of directories, and the last one of which will be the objects | ||
277 | inside the last directory. The names of the intermediate directories will have | ||
278 | '+' prepended: | ||
279 | |||
280 | J1223/@23/+xy...z/+kl...m/Epqr | ||
281 | |||
282 | |||
283 | Note that keys are raw data, and not only may they exceed NAME_MAX in size, | ||
284 | they may also contain things like '/' and NUL characters, and so they may not | ||
285 | be suitable for turning directly into a filename. | ||
286 | |||
287 | To handle this, CacheFiles will use a suitably printable filename directly and | ||
288 | "base-64" encode ones that aren't directly suitable. The two versions of | ||
289 | object filenames indicate the encoding: | ||
290 | |||
291 | OBJECT TYPE PRINTABLE ENCODED | ||
292 | =============== =============== =============== | ||
293 | Index "I..." "J..." | ||
294 | Data "D..." "E..." | ||
295 | Special "S..." "T..." | ||
296 | |||
297 | Intermediate directories are always "@" or "+" as appropriate. | ||
298 | |||
299 | |||
300 | Each object in the cache has an extended attribute label that holds the object | ||
301 | type ID (required to distinguish special objects) and the auxiliary data from | ||
302 | the netfs. The latter is used to detect stale objects in the cache and update | ||
303 | or retire them. | ||
304 | |||
305 | |||
306 | Note that CacheFiles will erase from the cache any file it doesn't recognise or | ||
307 | any file of an incorrect type (such as a FIFO file or a device file). | ||
308 | |||
309 | |||
310 | ========================== | ||
311 | SECURITY MODEL AND SELINUX | ||
312 | ========================== | ||
313 | |||
314 | CacheFiles is implemented to deal properly with the LSM security features of | ||
315 | the Linux kernel and the SELinux facility. | ||
316 | |||
317 | One of the problems that CacheFiles faces is that it is generally acting on | ||
318 | behalf of a process, and running in that process's context, and that includes a | ||
319 | security context that is not appropriate for accessing the cache - either | ||
320 | because the files in the cache are inaccessible to that process, or because if | ||
321 | the process creates a file in the cache, that file may be inaccessible to other | ||
322 | processes. | ||
323 | |||
324 | The way CacheFiles works is to temporarily change the security context (fsuid, | ||
325 | fsgid and actor security label) that the process acts as - without changing the | ||
326 | security context of the process when it is the target of an operation performed by | ||
327 | some other process (so signalling and suchlike still work correctly). | ||
328 | |||
329 | |||
330 | When the CacheFiles module is asked to bind to its cache, it: | ||
331 | |||
332 | (1) Finds the security label attached to the root cache directory and uses | ||
333 | that as the security label with which it will create files. By default, | ||
334 | this is: | ||
335 | |||
336 | cachefiles_var_t | ||
337 | |||
338 | (2) Finds the security label of the process which issued the bind request | ||
339 | (presumed to be the cachefilesd daemon), which by default will be: | ||
340 | |||
341 | cachefilesd_t | ||
342 | |||
343 | and asks LSM to supply a security ID as which it should act given the | ||
344 | daemon's label. By default, this will be: | ||
345 | |||
346 | cachefiles_kernel_t | ||
347 | |||
348 | SELinux transitions the daemon's security ID to the module's security ID | ||
349 | based on a rule of this form in the policy. | ||
350 | |||
351 | type_transition <daemon's-ID> kernel_t : process <module's-ID>; | ||
352 | |||
353 | For instance: | ||
354 | |||
355 | type_transition cachefilesd_t kernel_t : process cachefiles_kernel_t; | ||
356 | |||
357 | |||
358 | The module's security ID gives it permission to create, move and remove files | ||
359 | and directories in the cache, to find and access directories and files in the | ||
360 | cache, to set and access extended attributes on cache objects, and to read and | ||
361 | write files in the cache. | ||
362 | |||
363 | The daemon's security ID gives it only a very restricted set of permissions: it | ||
364 | may scan directories, stat files and erase files and directories. It may | ||
365 | not read or write files in the cache, and so it is precluded from accessing the | ||
366 | data cached therein; nor is it permitted to create new files in the cache. | ||
367 | |||
368 | |||
369 | There are policy source files available in: | ||
370 | |||
371 | http://people.redhat.com/~dhowells/fscache/cachefilesd-0.8.tar.bz2 | ||
372 | |||
373 | and later versions. In that tarball, see the files: | ||
374 | |||
375 | cachefilesd.te | ||
376 | cachefilesd.fc | ||
377 | cachefilesd.if | ||
378 | |||
379 | They are built and installed directly by the RPM. | ||
380 | |||
381 | If a non-RPM based system is being used, then copy the above files to their own | ||
382 | directory and run: | ||
383 | |||
384 | make -f /usr/share/selinux/devel/Makefile | ||
385 | semodule -i cachefilesd.pp | ||
386 | |||
387 | You will need checkpolicy and selinux-policy-devel installed prior to the | ||
388 | build. | ||
389 | |||
390 | |||
391 | By default, the cache is located in /var/fscache, but if it is desirable that | ||
392 | it should be elsewhere, then either the above policy files must be altered, or | ||
393 | an auxiliary policy must be installed to label the alternate location of the | ||
394 | cache. | ||
395 | |||
396 | For instructions on how to add an auxiliary policy to enable the cache to be | ||
397 | located elsewhere when SELinux is in enforcing mode, please see: | ||
398 | |||
399 | /usr/share/doc/cachefilesd-*/move-cache.txt | ||
400 | |||
401 | This file is installed with the cachefilesd RPM; alternatively, the document | ||
402 | can be found in the sources. | ||
403 | |||
404 | |||
405 | ================== | ||
406 | A NOTE ON SECURITY | ||
407 | ================== | ||
408 | |||
409 | CacheFiles makes use of the split security in the task_struct. It allocates | ||
410 | its own task_security structure, and redirects current->cred to point to it | ||
411 | when it acts on behalf of another process, in that process's context. | ||
412 | |||
413 | The reason it does this is that it calls vfs_mkdir() and suchlike rather than | ||
414 | bypassing security and calling inode ops directly. Therefore the VFS and LSM | ||
415 | may deny CacheFiles access to the cache data because under some | ||
416 | circumstances the caching code is running in the security context of whatever | ||
417 | process issued the original syscall on the netfs. | ||
418 | |||
419 | Furthermore, should CacheFiles create a file or directory, the security | ||
420 | parameters with which that object is created (UID, GID, security label) would | ||
421 | be derived from the process that issued the system call, thus potentially | ||
422 | preventing other processes from accessing the cache - including CacheFiles's | ||
423 | cache management daemon (cachefilesd). | ||
424 | |||
425 | What is required is to temporarily override the security of the process that | ||
426 | issued the system call. We can't, however, just do an in-place change of the | ||
427 | security data as that affects the process as an object, not just as a subject. | ||
428 | This means it may lose signals or ptrace events for example, and affects what | ||
429 | the process looks like in /proc. | ||
430 | |||
431 | So CacheFiles makes use of a logical split in the security between the | ||
432 | objective security (task->real_cred) and the subjective security (task->cred). | ||
433 | The objective security holds the intrinsic security properties of a process and | ||
434 | is never overridden. This is what appears in /proc, and is what is used when a | ||
435 | process is the target of an operation by some other process (SIGKILL for | ||
436 | example). | ||
437 | |||
438 | The subjective security holds the active security properties of a process, and | ||
439 | may be overridden. This is not seen externally, and is used when a process | ||
440 | acts upon another object, for example SIGKILLing another process or opening a | ||
441 | file. | ||
442 | |||
443 | LSM hooks exist that allow SELinux (or Smack or whatever) to reject a request | ||
444 | for CacheFiles to run in a context of a specific security label, or to create | ||
445 | files and directories with another security label. | ||
446 | |||
447 | |||
448 | ======================= | ||
449 | STATISTICAL INFORMATION | ||
450 | ======================= | ||
451 | |||
452 | If FS-Cache is compiled with the following option enabled: | ||
453 | |||
454 | CONFIG_CACHEFILES_HISTOGRAM=y | ||
455 | |||
456 | then it will gather certain statistics and display them through a proc file. | ||
457 | |||
458 | (*) /proc/fs/cachefiles/histogram | ||
459 | |||
460 | cat /proc/fs/cachefiles/histogram | ||
461 | JIFS SECS LOOKUPS MKDIRS CREATES | ||
462 | ===== ===== ========= ========= ========= | ||
463 | |||
464 | This shows, for a variety of tasks, how many of them took each amount of | ||
465 | time between 0 jiffies and HZ-1 jiffies to run. The | ||
466 | columns are as follows: | ||
467 | |||
468 | COLUMN TIME MEASUREMENT | ||
469 | ======= ======================================================= | ||
470 | LOOKUPS Length of time to perform a lookup on the backing fs | ||
471 | MKDIRS Length of time to perform a mkdir on the backing fs | ||
472 | CREATES Length of time to perform a create on the backing fs | ||
473 | |||
474 | Each row shows the number of events that took a particular range of times. | ||
475 | Each step is 1 jiffy in size. The JIFS column indicates the particular | ||
476 | jiffy range covered, and the SECS field the equivalent number of seconds. | ||
477 | |||
478 | |||
479 | ========= | ||
480 | DEBUGGING | ||
481 | ========= | ||
482 | |||
483 | If CONFIG_CACHEFILES_DEBUG is enabled, the CacheFiles facility can have runtime | ||
484 | debugging enabled by adjusting the value in: | ||
485 | |||
486 | /sys/module/cachefiles/parameters/debug | ||
487 | |||
488 | This is a bitmask of debugging streams to enable: | ||
489 | |||
490 | BIT VALUE STREAM POINT | ||
491 | ======= ======= =============================== ======================= | ||
492 | 0 1 General Function entry trace | ||
493 | 1 2 Function exit trace | ||
494 | 2 4 General | ||
495 | |||
496 | The appropriate set of values should be OR'd together and the result written to | ||
497 | the control file. For example: | ||
498 | |||
499 | echo $((1|4)) >/sys/module/cachefiles/parameters/debug | ||
500 | |||
501 | will turn on the function entry and internal debug point traces. | ||
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt new file mode 100644 index 000000000000..9e94b9491d89 --- /dev/null +++ b/Documentation/filesystems/caching/fscache.txt | |||
@@ -0,0 +1,333 @@ | |||
1 | ========================== | ||
2 | General Filesystem Caching | ||
3 | ========================== | ||
4 | |||
5 | ======== | ||
6 | OVERVIEW | ||
7 | ======== | ||
8 | |||
9 | This facility is a general purpose cache for network filesystems, though it | ||
10 | could be used for caching other things such as ISO9660 filesystems too. | ||
11 | |||
12 | FS-Cache mediates between cache backends (such as CacheFS) and network | ||
13 | filesystems: | ||
14 | |||
15 | +---------+ | ||
16 | | | +--------------+ | ||
17 | | NFS |--+ | | | ||
18 | | | | +-->| CacheFS | | ||
19 | +---------+ | +----------+ | | /dev/hda5 | | ||
20 | | | | | +--------------+ | ||
21 | +---------+ +-->| | | | ||
22 | | | | |--+ | ||
23 | | AFS |----->| FS-Cache | | ||
24 | | | | |--+ | ||
25 | +---------+ +-->| | | | ||
26 | | | | | +--------------+ | ||
27 | +---------+ | +----------+ | | | | ||
28 | | | | +-->| CacheFiles | | ||
29 | | ISOFS |--+ | /var/cache | | ||
30 | | | +--------------+ | ||
31 | +---------+ | ||
32 | |||
33 | Or to look at it another way, FS-Cache is a module that provides a caching | ||
34 | facility to a network filesystem such that the cache is transparent to the | ||
35 | user: | ||
36 | |||
37 | +---------+ | ||
38 | | | | ||
39 | | Server | | ||
40 | | | | ||
41 | +---------+ | ||
42 | | NETWORK | ||
43 | ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
44 | | | ||
45 | | +----------+ | ||
46 | V | | | ||
47 | +---------+ | | | ||
48 | | | | | | ||
49 | | NFS |----->| FS-Cache | | ||
50 | | | | |--+ | ||
51 | +---------+ | | | +--------------+ +--------------+ | ||
52 | | | | | | | | | | ||
53 | V +----------+ +-->| CacheFiles |-->| Ext3 | | ||
54 | +---------+ | /var/cache | | /dev/sda6 | | ||
55 | | | +--------------+ +--------------+ | ||
56 | | VFS | ^ ^ | ||
57 | | | | | | ||
58 | +---------+ +--------------+ | | ||
59 | | KERNEL SPACE | | | ||
60 | ~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|~~~~~~|~~~~ | ||
61 | | USER SPACE | | | ||
62 | V | | | ||
63 | +---------+ +--------------+ | ||
64 | | | | | | ||
65 | | Process | | cachefilesd | | ||
66 | | | | | | ||
67 | +---------+ +--------------+ | ||
68 | |||
69 | |||
70 | FS-Cache does not completely load each netfs file into the cache before | ||
71 | permitting it to be accessed, and then serve the pages out of that cache | ||
72 | rather than the netfs inode, because: | ||
73 | |||
74 | (1) It must be practical to operate without a cache. | ||
75 | |||
76 | (2) The size of any accessible file must not be limited to the size of the | ||
77 | cache. | ||
78 | |||
79 | (3) The combined size of all opened files (this includes mapped libraries) | ||
80 | must not be limited to the size of the cache. | ||
81 | |||
82 | (4) The user should not be forced to download an entire file just to do a | ||
83 | one-off access of a small portion of it (such as might be done with the | ||
84 | "file" program). | ||
85 | |||
86 | It instead serves the cache out in PAGE_SIZE chunks as and when requested by | ||
87 | the netfs('s) using it. | ||
88 | |||
89 | |||
90 | FS-Cache provides the following facilities: | ||
91 | |||
92 | (1) More than one cache can be used at once. Caches can be selected | ||
93 | explicitly by use of tags. | ||
94 | |||
95 | (2) Caches can be added / removed at any time. | ||
96 | |||
97 | (3) The netfs is provided with an interface that allows either party to | ||
98 | withdraw caching facilities from a file (required for (2)). | ||
99 | |||
100 | (4) The interface to the netfs returns as few errors as possible, preferring | ||
101 | rather to let the netfs remain oblivious. | ||
102 | |||
103 | (5) Cookies are used to represent indices, files and other objects to the | ||
104 | netfs. The simplest cookie is just a NULL pointer - indicating nothing | ||
105 | cached there. | ||
106 | |||
107 | (6) The netfs is allowed to propose - dynamically - any index hierarchy it | ||
108 | desires, though it must be aware that the index search function is | ||
109 | recursive, stack space is limited, and indices can only be children of | ||
110 | indices. | ||
111 | |||
112 | (7) Data I/O is done direct to and from the netfs's pages. The netfs | ||
113 | indicates that page A is at index B of the data-file represented by cookie | ||
114 | C, and that it should be read or written. The cache backend may or may | ||
115 | not start I/O on that page, but if it does, a netfs callback will be | ||
116 | invoked to indicate completion. The I/O may be either synchronous or | ||
117 | asynchronous. | ||
118 | |||
119 | (8) Cookies can be "retired" upon release. At this point FS-Cache will mark | ||
120 | them as obsolete and the index hierarchy rooted at that point will get | ||
121 | recycled. | ||
122 | |||
123 | (9) The netfs provides a "match" function for index searches. In addition to | ||
124 | saying whether a match was made or not, this can also specify that an | ||
125 | entry should be updated or deleted. | ||
126 | |||
127 | (10) As much as possible is done asynchronously. | ||
128 | |||
129 | |||
130 | FS-Cache maintains a virtual indexing tree in which all indices, files, objects | ||
131 | and pages are kept. Bits of this tree may actually reside in one or more | ||
132 | caches. | ||
133 | |||
134 | FSDEF | ||
135 | | | ||
136 | +------------------------------------+ | ||
137 | | | | ||
138 | NFS AFS | ||
139 | | | | ||
140 | +--------------------------+ +-----------+ | ||
141 | | | | | | ||
142 | homedir mirror afs.org redhat.com | ||
143 | | | | | ||
144 | +------------+ +---------------+ +----------+ | ||
145 | | | | | | | | ||
146 | 00001 00002 00007 00125 vol00001 vol00002 | ||
147 | | | | | | | ||
148 | +---+---+ +-----+ +---+ +------+------+ +-----+----+ | ||
149 | | | | | | | | | | | | | | | ||
150 | PG0 PG1 PG2 PG0 XATTR PG0 PG1 DIRENT DIRENT DIRENT R/W R/O Bak | ||
151 | | | | ||
152 | PG0 +-------+ | ||
153 | | | | ||
154 | 00001 00003 | ||
155 | | | ||
156 | +---+---+ | ||
157 | | | | | ||
158 | PG0 PG1 PG2 | ||
159 | |||
160 | In the example above, you can see two netfs's being backed: NFS and AFS. These | ||
161 | have different index hierarchies: | ||
162 | |||
163 | (*) The NFS primary index contains per-server indices. Each server index is | ||
164 | indexed by NFS file handles to get data file objects. Each data file | ||
165 | object can have an array of pages, but may also have further child | ||
166 | objects, such as extended attributes and directory entries. Extended | ||
167 | attribute objects themselves have page-array contents. | ||
168 | |||
169 | (*) The AFS primary index contains per-cell indices. Each cell index contains | ||
170 | per-logical-volume indices. Each volume index contains up to three | ||
171 | indices for the read-write, read-only and backup mirrors of those volumes. | ||
172 | Each of these contains vnode data file objects, each of which contains an | ||
173 | array of pages. | ||
174 | |||
175 | The very top index is the FS-Cache master index in which individual netfs's | ||
176 | have entries. | ||
177 | |||
178 | Any index object may reside in more than one cache, provided it only has index | ||
179 | children. Any index with non-index object children will be assumed to only | ||
180 | reside in one cache. | ||
181 | |||
182 | |||
183 | The netfs API to FS-Cache can be found in: | ||
184 | |||
185 | Documentation/filesystems/caching/netfs-api.txt | ||
186 | |||
187 | The cache backend API to FS-Cache can be found in: | ||
188 | |||
189 | Documentation/filesystems/caching/backend-api.txt | ||
190 | |||
191 | A description of the internal representations and object state machine can be | ||
192 | found in: | ||
193 | |||
194 | Documentation/filesystems/caching/object.txt | ||
195 | |||
196 | |||
197 | ======================= | ||
198 | STATISTICAL INFORMATION | ||
199 | ======================= | ||
200 | |||
201 | If FS-Cache is compiled with the following options enabled: | ||
202 | |||
203 | CONFIG_FSCACHE_STATS=y | ||
204 | CONFIG_FSCACHE_HISTOGRAM=y | ||
205 | |||
206 | then it will gather certain statistics and display them through a number of | ||
207 | proc files. | ||
208 | |||
209 | (*) /proc/fs/fscache/stats | ||
210 | |||
211 | This shows counts of a number of events that can happen in FS-Cache: | ||
212 | |||
213 | CLASS EVENT MEANING | ||
214 | ======= ======= ======================================================= | ||
215 | Cookies idx=N Number of index cookies allocated | ||
216 | dat=N Number of data storage cookies allocated | ||
217 | spc=N Number of special cookies allocated | ||
218 | Objects alc=N Number of objects allocated | ||
219 | nal=N Number of object allocation failures | ||
220 | avl=N Number of objects that reached the available state | ||
221 | ded=N Number of objects that reached the dead state | ||
222 | ChkAux non=N Number of objects that didn't have a coherency check | ||
223 | ok=N Number of objects that passed a coherency check | ||
224 | upd=N Number of objects that needed a coherency data update | ||
225 | obs=N Number of objects that were declared obsolete | ||
226 | Pages mrk=N Number of pages marked as being cached | ||
227 | unc=N Number of uncache page requests seen | ||
228 | Acquire n=N Number of acquire cookie requests seen | ||
229 | nul=N Number of acq reqs given a NULL parent | ||
230 | noc=N Number of acq reqs rejected due to no cache available | ||
231 | ok=N Number of acq reqs succeeded | ||
232 | nbf=N Number of acq reqs rejected due to error | ||
233 | oom=N Number of acq reqs failed on ENOMEM | ||
234 | Lookups n=N Number of lookup calls made on cache backends | ||
235 | neg=N Number of negative lookups made | ||
236 | pos=N Number of positive lookups made | ||
237 | crt=N Number of objects created by lookup | ||
238 | Updates n=N Number of update cookie requests seen | ||
239 | nul=N Number of upd reqs given a NULL parent | ||
240 | run=N Number of upd reqs granted CPU time | ||
241 | Relinqs n=N Number of relinquish cookie requests seen | ||
242 | nul=N Number of rlq reqs given a NULL parent | ||
243 | wcr=N Number of rlq reqs waited on completion of creation | ||
244 | AttrChg n=N Number of attribute changed requests seen | ||
245 | ok=N Number of attr changed requests queued | ||
246 | nbf=N Number of attr changed rejected -ENOBUFS | ||
247 | oom=N Number of attr changed failed -ENOMEM | ||
248 | run=N Number of attr changed ops given CPU time | ||
249 | Allocs n=N Number of allocation requests seen | ||
250 | ok=N Number of successful alloc reqs | ||
251 | wt=N Number of alloc reqs that waited on lookup completion | ||
252 | nbf=N Number of alloc reqs rejected -ENOBUFS | ||
253 | ops=N Number of alloc reqs submitted | ||
254 | owt=N Number of alloc reqs waited for CPU time | ||
255 | Retrvls n=N Number of retrieval (read) requests seen | ||
256 | ok=N Number of successful retr reqs | ||
257 | wt=N Number of retr reqs that waited on lookup completion | ||
258 | nod=N Number of retr reqs returned -ENODATA | ||
259 | nbf=N Number of retr reqs rejected -ENOBUFS | ||
260 | int=N Number of retr reqs aborted -ERESTARTSYS | ||
261 | oom=N Number of retr reqs failed -ENOMEM | ||
262 | ops=N Number of retr reqs submitted | ||
263 | owt=N Number of retr reqs waited for CPU time | ||
264 | Stores n=N Number of storage (write) requests seen | ||
265 | ok=N Number of successful store reqs | ||
266 | agn=N Number of store reqs on a page already pending storage | ||
267 | nbf=N Number of store reqs rejected -ENOBUFS | ||
268 | oom=N Number of store reqs failed -ENOMEM | ||
269 | ops=N Number of store reqs submitted | ||
270 | run=N Number of store reqs granted CPU time | ||
271 | Ops pend=N Number of times async ops added to pending queues | ||
272 | run=N Number of times async ops given CPU time | ||
273 | enq=N Number of times async ops queued for processing | ||
274 | dfr=N Number of async ops queued for deferred release | ||
275 | rel=N Number of async ops released | ||
276 | gc=N Number of deferred-release async ops garbage collected | ||
277 | |||
278 | |||
279 | (*) /proc/fs/fscache/histogram | ||
280 | |||
281 | cat /proc/fs/fscache/histogram | ||
282 | JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS | ||
283 | ===== ===== ========= ========= ========= ========= ========= | ||
284 | |||
285 | This shows, for a variety of tasks, how many times each task took a given | ||
286 | amount of time (between 0 jiffies and HZ-1 jiffies) to run. The columns | ||
287 | are as follows: | ||
288 | |||
289 | COLUMN TIME MEASUREMENT | ||
290 | ======= ======================================================= | ||
291 | OBJ INST Length of time to instantiate an object | ||
292 | OP RUNS Length of time a call to process an operation took | ||
293 | OBJ RUNS Length of time a call to process an object event took | ||
294 | RETRV DLY Time between requesting a read and the lookup completing | ||
295 | RETRIEVLS Time between beginning and end of a retrieval | ||
296 | |||
297 | Each row shows the number of events that took a particular range of times. | ||
298 | Each step is 1 jiffy in size. The JIFS column indicates the particular | ||
299 | jiffy range covered, and the SECS field the equivalent number of seconds. | ||
300 | |||
301 | |||
302 | ========= | ||
303 | DEBUGGING | ||
304 | ========= | ||
305 | |||
306 | If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime | ||
307 | debugging enabled by adjusting the value in: | ||
308 | |||
309 | /sys/module/fscache/parameters/debug | ||
310 | |||
311 | This is a bitmask of debugging streams to enable: | ||
312 | |||
313 | BIT VALUE STREAM POINT | ||
314 | ======= ======= =============================== ======================= | ||
315 | 0 1 Cache management Function entry trace | ||
316 | 1 2 Function exit trace | ||
317 | 2 4 General | ||
318 | 3 8 Cookie management Function entry trace | ||
319 | 4 16 Function exit trace | ||
320 | 5 32 General | ||
321 | 6 64 Page handling Function entry trace | ||
322 | 7 128 Function exit trace | ||
323 | 8 256 General | ||
324 | 9 512 Operation management Function entry trace | ||
325 | 10 1024 Function exit trace | ||
326 | 11 2048 General | ||
327 | |||
328 | The appropriate set of values should be OR'd together and the result written to | ||
329 | the control file. For example: | ||
330 | |||
331 | echo $((1|8|64|512)) >/sys/module/fscache/parameters/debug | ||
332 | |||
333 | will turn on all function entry debugging. | ||
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt new file mode 100644 index 000000000000..4db125b3a5c6 --- /dev/null +++ b/Documentation/filesystems/caching/netfs-api.txt | |||
@@ -0,0 +1,778 @@ | |||
1 | =============================== | ||
2 | FS-CACHE NETWORK FILESYSTEM API | ||
3 | =============================== | ||
4 | |||
5 | There's an API by which a network filesystem can make use of the FS-Cache | ||
6 | facilities. This is based around a number of principles: | ||
7 | |||
8 | (1) Caches can store a number of different object types. There are two main | ||
9 | object types: indices and files. The first is a special type used by | ||
10 | FS-Cache to make finding objects faster and to make retiring of groups of | ||
11 | objects easier. | ||
12 | |||
13 | (2) Every index, file or other object is represented by a cookie. This cookie | ||
14 | may or may not have anything associated with it, but the netfs doesn't | ||
15 | need to care. | ||
16 | |||
17 | (3) Barring the top-level index (one entry per cached netfs), the index | ||
18 | hierarchy for each netfs is structured according the whim of the netfs. | ||
19 | |||
20 | This API is declared in <linux/fscache.h>. | ||
21 | |||
22 | This document contains the following sections: | ||
23 | |||
24 | (1) Network filesystem definition | ||
25 | (2) Index definition | ||
26 | (3) Object definition | ||
27 | (4) Network filesystem (un)registration | ||
28 | (5) Cache tag lookup | ||
29 | (6) Index registration | ||
30 | (7) Data file registration | ||
31 | (8) Miscellaneous object registration | ||
32 | (9) Setting the data file size | ||
33 | (10) Page alloc/read/write | ||
34 | (11) Page uncaching | ||
35 | (12) Index and data file update | ||
36 | (13) Miscellaneous cookie operations | ||
37 | (14) Cookie unregistration | ||
38 | (15) Index and data file invalidation | ||
39 | (16) FS-Cache specific page flags. | ||
40 | |||
41 | |||
42 | ============================= | ||
43 | NETWORK FILESYSTEM DEFINITION | ||
44 | ============================= | ||
45 | |||
46 | FS-Cache needs a description of the network filesystem. This is specified | ||
47 | using a record of the following structure: | ||
48 | |||
49 | struct fscache_netfs { | ||
50 | uint32_t version; | ||
51 | const char *name; | ||
52 | struct fscache_cookie *primary_index; | ||
53 | ... | ||
54 | }; | ||
55 | |||
56 | The first two fields should be filled in before registration, and the third | ||
57 | will be filled in by the registration function; any other fields should just be | ||
58 | ignored and are for internal use only. | ||
59 | |||
60 | The fields are: | ||
61 | |||
62 | (1) The name of the netfs (used as the key in the toplevel index). | ||
63 | |||
64 | (2) The version of the netfs (if the name matches but the version doesn't, the | ||
65 | entire in-cache hierarchy for this netfs will be scrapped and begun | ||
66 | afresh). | ||
67 | |||
68 | (3) The cookie representing the primary index will be allocated according to | ||
69 | another parameter passed into the registration function. | ||
70 | |||
71 | For example, kAFS (linux/fs/afs/) uses the following definitions to describe | ||
72 | itself: | ||
73 | |||
74 | struct fscache_netfs afs_cache_netfs = { | ||
75 | .version = 0, | ||
76 | .name = "afs", | ||
77 | }; | ||
78 | |||
79 | |||
80 | ================ | ||
81 | INDEX DEFINITION | ||
82 | ================ | ||
83 | |||
84 | Indices are used for two purposes: | ||
85 | |||
86 | (1) To aid the finding of a file based on a series of keys (such as AFS's | ||
87 | "cell", "volume ID", "vnode ID"). | ||
88 | |||
89 | (2) To make it easier to discard a subset of all the files cached based around | ||
90 | a particular key - for instance to mirror the removal of an AFS volume. | ||
91 | |||
92 | However, since it's unlikely that any two netfs's are going to want to define | ||
93 | their index hierarchies in quite the same way, FS-Cache tries to impose as few | ||
94 | restraints as possible on how an index is structured and where it is placed in | ||
95 | the tree. The netfs can even mix indices and data files at the same level, but | ||
96 | it's not recommended. | ||
97 | |||
98 | Each index entry consists of a key of indeterminate length plus some auxiliary | ||
99 | data, also of indeterminate length. | ||
100 | |||
101 | There are some limits on indices: | ||
102 | |||
103 | (1) Any index containing non-index objects should be restricted to a single | ||
104 | cache. Any such objects created within an index will be created in the | ||
105 | first cache only. The cache in which an index is created can be | ||
106 | controlled by cache tags (see below). | ||
107 | |||
108 | (2) The entry data must be atomically journallable, so it is limited to about | ||
109 | 400 bytes at present. At least 400 bytes will be available. | ||
110 | |||
111 | (3) The depth of the index tree should be judged with care as the search | ||
112 | function is recursive. Too many layers will run the kernel out of stack. | ||
113 | |||
114 | |||
115 | ================= | ||
116 | OBJECT DEFINITION | ||
117 | ================= | ||
118 | |||
119 | To define an object, a structure of the following type should be filled out: | ||
120 | |||
121 | struct fscache_cookie_def | ||
122 | { | ||
123 | uint8_t name[16]; | ||
124 | uint8_t type; | ||
125 | |||
126 | struct fscache_cache_tag *(*select_cache)( | ||
127 | const void *parent_netfs_data, | ||
128 | const void *cookie_netfs_data); | ||
129 | |||
130 | uint16_t (*get_key)(const void *cookie_netfs_data, | ||
131 | void *buffer, | ||
132 | uint16_t bufmax); | ||
133 | |||
134 | void (*get_attr)(const void *cookie_netfs_data, | ||
135 | uint64_t *size); | ||
136 | |||
137 | uint16_t (*get_aux)(const void *cookie_netfs_data, | ||
138 | void *buffer, | ||
139 | uint16_t bufmax); | ||
140 | |||
141 | enum fscache_checkaux (*check_aux)(void *cookie_netfs_data, | ||
142 | const void *data, | ||
143 | uint16_t datalen); | ||
144 | |||
145 | void (*get_context)(void *cookie_netfs_data, void *context); | ||
146 | |||
147 | void (*put_context)(void *cookie_netfs_data, void *context); | ||
148 | |||
149 | void (*mark_pages_cached)(void *cookie_netfs_data, | ||
150 | struct address_space *mapping, | ||
151 | struct pagevec *cached_pvec); | ||
152 | |||
153 | void (*now_uncached)(void *cookie_netfs_data); | ||
154 | }; | ||
155 | |||
156 | This has the following fields (an example definition follows the list): | ||
157 | |||
158 | (1) The type of the object [mandatory]. | ||
159 | |||
160 | This is one of the following values: | ||
161 | |||
162 | (*) FSCACHE_COOKIE_TYPE_INDEX | ||
163 | |||
164 | This defines an index, which is a special FS-Cache type. | ||
165 | |||
166 | (*) FSCACHE_COOKIE_TYPE_DATAFILE | ||
167 | |||
168 | This defines an ordinary data file. | ||
169 | |||
170 | (*) Any other value between 2 and 255 | ||
171 | |||
172 | This defines an extraordinary object such as an XATTR. | ||
173 | |||
174 | (2) The name of the object type (NUL terminated unless all 16 chars are used) | ||
175 | [optional]. | ||
176 | |||
177 | (3) A function to select the cache in which to store an index [optional]. | ||
178 | |||
179 | This function is invoked when an index needs to be instantiated in a cache | ||
180 | during the instantiation of a non-index object. Only the immediate index | ||
181 | parent for the non-index object will be queried. Any indices above that | ||
182 | in the hierarchy may be stored in multiple caches. This function does not | ||
183 | need to be supplied for any non-index object or any index that will only | ||
184 | have index children. | ||
185 | |||
186 | If this function is not supplied or if it returns NULL then the first | ||
187 | cache in the parent's list will be chosen, or failing that, the first | ||
188 | cache in the master list. | ||
189 | |||
190 | (4) A function to retrieve an object's key from the netfs [mandatory]. | ||
191 | |||
192 | This function will be called with the netfs data that was passed to the | ||
193 | cookie acquisition function and the maximum length of key data that it may | ||
194 | provide. It should write the required key data into the given buffer and | ||
195 | return the quantity it wrote. | ||
196 | |||
197 | (5) A function to retrieve attribute data from the netfs [optional]. | ||
198 | |||
199 | This function will be called with the netfs data that was passed to the | ||
200 | cookie acquisition function. It should return the size of the file if | ||
201 | this is a data file. The size may be used to govern how much cache must | ||
202 | be reserved for this file in the cache. | ||
203 | |||
204 | If the function is absent, a file size of 0 is assumed. | ||
205 | |||
206 | (6) A function to retrieve auxiliary data from the netfs [optional]. | ||
207 | |||
208 | This function will be called with the netfs data that was passed to the | ||
209 | cookie acquisition function and the maximum length of auxiliary data that | ||
210 | it may provide. It should write the auxiliary data into the given buffer | ||
211 | and return the quantity it wrote. | ||
212 | |||
213 | If this function is absent, the auxiliary data length will be set to 0. | ||
214 | |||
215 | The length of the auxiliary data buffer may be dependent on the key | ||
216 | length. A netfs mustn't rely on being able to provide more than 400 bytes | ||
217 | for both. | ||
218 | |||
219 | (7) A function to check the auxiliary data [optional]. | ||
220 | |||
221 | This function will be called to check that a match found in the cache for | ||
222 | this object is valid. For instance with AFS it could check the auxiliary | ||
223 | data against the data version number returned by the server to determine | ||
224 | whether the index entry in a cache is still valid. | ||
225 | |||
226 | If this function is absent, it will be assumed that matching objects in a | ||
227 | cache are always valid. | ||
228 | |||
229 | If present, the function should return one of the following values: | ||
230 | |||
231 | (*) FSCACHE_CHECKAUX_OKAY - the entry is okay as is | ||
232 | (*) FSCACHE_CHECKAUX_NEEDS_UPDATE - the entry requires update | ||
233 | (*) FSCACHE_CHECKAUX_OBSOLETE - the entry should be deleted | ||
234 | |||
235 | This function can also be used to extract data from the auxiliary data in | ||
236 | the cache and copy it into the netfs's structures. | ||
237 | |||
238 | (8) A pair of functions to manage contexts for the completion callback | ||
239 | [optional]. | ||
240 | |||
241 | The cache read/write functions are passed a context which is then passed | ||
242 | to the I/O completion callback function. To ensure this context remains | ||
243 | valid until after the I/O completion is called, two functions may be | ||
244 | provided: one to get an extra reference on the context, and one to drop a | ||
245 | reference to it. | ||
246 | |||
247 | If the context is not used or is a type of object that won't go out of | ||
248 | scope, then these functions are not required. These functions are not | ||
249 | required for indices as indices may not contain data. These functions may | ||
250 | be called in interrupt context and so may not sleep. | ||
251 | |||
252 | (9) A function to mark a page as retaining cache metadata [optional]. | ||
253 | |||
254 | This is called by the cache to indicate that it is retaining in-memory | ||
255 | information for this page and that the netfs should uncache the page when | ||
256 | it has finished. This does not indicate whether there's data on the disk | ||
257 | or not. Note that several pages at once may be presented for marking. | ||
258 | |||
259 | The PG_fscache bit is set on the pages before this function would be | ||
260 | called, so the function need not be provided if this is sufficient. | ||
261 | |||
262 | This function is not required for indices as they're not permitted data. | ||
263 | |||
264 | (10) A function to unmark all the pages retaining cache metadata [mandatory]. | ||
265 | |||
266 | This is called by FS-Cache to indicate that a backing store is being | ||
267 | unbound from a cookie and that all the marks on the pages should be | ||
268 | cleared to prevent confusion. Note that the cache will have torn down all | ||
269 | its tracking information so that the pages don't need to be explicitly | ||
270 | uncached. | ||
271 | |||
272 | This function is not required for indices as they're not permitted data. | ||
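
As an illustration of the above, here is a minimal sketch of a data-file cookie
definition for a hypothetical netfs. The myfs_* names and the myfs_vnode
structure are invented for this example and are not part of the API; only the
operations a netfs actually needs have to be supplied:

	#include <linux/fscache.h>
	#include <linux/string.h>

	struct myfs_vnode {
		u32			vnode_id;	/* object key */
		u64			size;		/* current file size */
		u32			data_version;	/* coherency (auxiliary) data */
		struct fscache_cookie	*cache;		/* cookie for this file */
	};

	static uint16_t myfs_vnode_get_key(const void *cookie_netfs_data,
					   void *buffer, uint16_t bufmax)
	{
		const struct myfs_vnode *vnode = cookie_netfs_data;

		if (bufmax < sizeof(vnode->vnode_id))
			return 0;
		memcpy(buffer, &vnode->vnode_id, sizeof(vnode->vnode_id));
		return sizeof(vnode->vnode_id);
	}

	static void myfs_vnode_get_attr(const void *cookie_netfs_data,
					uint64_t *size)
	{
		const struct myfs_vnode *vnode = cookie_netfs_data;

		*size = vnode->size;
	}

	static uint16_t myfs_vnode_get_aux(const void *cookie_netfs_data,
					   void *buffer, uint16_t bufmax)
	{
		const struct myfs_vnode *vnode = cookie_netfs_data;

		if (bufmax < sizeof(vnode->data_version))
			return 0;
		memcpy(buffer, &vnode->data_version, sizeof(vnode->data_version));
		return sizeof(vnode->data_version);
	}

	static enum fscache_checkaux myfs_vnode_check_aux(void *cookie_netfs_data,
							  const void *data,
							  uint16_t datalen)
	{
		struct myfs_vnode *vnode = cookie_netfs_data;
		u32 stored;

		if (datalen != sizeof(stored))
			return FSCACHE_CHECKAUX_OBSOLETE;
		memcpy(&stored, data, sizeof(stored));
		return stored == vnode->data_version ?
			FSCACHE_CHECKAUX_OKAY : FSCACHE_CHECKAUX_OBSOLETE;
	}

	static struct fscache_cookie_def myfs_vnode_cache_object_def = {
		.name		= "myfs.vnode",
		.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
		.get_key	= myfs_vnode_get_key,
		.get_attr	= myfs_vnode_get_attr,
		.get_aux	= myfs_vnode_get_aux,
		.check_aux	= myfs_vnode_check_aux,
	};

The later sketches in this document reuse these hypothetical myfs_* names.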
273 | |||
274 | |||
275 | =================================== | ||
276 | NETWORK FILESYSTEM (UN)REGISTRATION | ||
277 | =================================== | ||
278 | |||
279 | The first step is to declare the network filesystem to the cache. This also | ||
280 | involves specifying the layout of the primary index (for AFS, this would be the | ||
281 | "cell" level). | ||
282 | |||
283 | The registration function is: | ||
284 | |||
285 | int fscache_register_netfs(struct fscache_netfs *netfs); | ||
286 | |||
287 | It just takes a pointer to the netfs definition. It returns 0 or an error as | ||
288 | appropriate. | ||
289 | |||
290 | For kAFS, registration is done as follows: | ||
291 | |||
292 | ret = fscache_register_netfs(&afs_cache_netfs); | ||
293 | |||
294 | The last step is, of course, unregistration: | ||
295 | |||
296 | void fscache_unregister_netfs(struct fscache_netfs *netfs); | ||
297 | |||
298 | |||
299 | ================ | ||
300 | CACHE TAG LOOKUP | ||
301 | ================ | ||
302 | |||
303 | FS-Cache permits the use of more than one cache. To permit particular index | ||
304 | subtrees to be bound to particular caches, the second step is to look up cache | ||
305 | representation tags. This step is optional; it can be left entirely up to | ||
306 | FS-Cache as to which cache should be used. The problem with doing that is that | ||
307 | FS-Cache will always pick the first cache that was registered. | ||
308 | |||
309 | To get the representation for a named tag: | ||
310 | |||
311 | struct fscache_cache_tag *fscache_lookup_cache_tag(const char *name); | ||
312 | |||
313 | This takes a text string as the name and returns a representation of a tag. It | ||
314 | will never return an error. It may return a dummy tag, however, if it runs out | ||
315 | of memory; this will inhibit caching with this tag. | ||
316 | |||
317 | Any representation so obtained must be released by passing it to this function: | ||
318 | |||
319 | void fscache_release_cache_tag(struct fscache_cache_tag *tag); | ||
320 | |||
321 | The tag will be retrieved by FS-Cache when it calls the object definition | ||
322 | operation select_cache(). | ||
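
As a sketch of how this might be wired up: a netfs could look up its preferred
cache by name at mount time (the tag name coming from a mount option, say),
hand the tag back from a select_cache() operation, and release it again at
unmount. The myfs_* names are hypothetical:

	static struct fscache_cache_tag *myfs_cache_tag;

	/* look the preferred cache up by name, typically at mount time */
	static void myfs_get_cache_tag(const char *tagname)
	{
		/* never returns an error; may return a dummy tag that simply
		 * inhibits caching if memory ran out */
		myfs_cache_tag = fscache_lookup_cache_tag(tagname);
	}

	/* wired into a cookie definition as .select_cache, to bind indices
	 * that will have non-index children to the chosen cache */
	static struct fscache_cache_tag *
	myfs_select_cache(const void *parent_netfs_data,
			  const void *cookie_netfs_data)
	{
		return myfs_cache_tag;
	}

	/* drop the tag again, typically at unmount time */
	static void myfs_put_cache_tag(void)
	{
		fscache_release_cache_tag(myfs_cache_tag);
	}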
323 | |||
324 | |||
325 | ================== | ||
326 | INDEX REGISTRATION | ||
327 | ================== | ||
328 | |||
329 | The third step is to inform FS-Cache about part of an index hierarchy that can | ||
330 | be used to locate files. This is done by requesting a cookie for each index in | ||
331 | the path to the file: | ||
332 | |||
333 | struct fscache_cookie * | ||
334 | fscache_acquire_cookie(struct fscache_cookie *parent, | ||
335 | const struct fscache_cookie_def *def, | ||
336 | void *netfs_data); | ||
337 | |||
338 | This function creates an index entry in the index represented by parent, | ||
339 | filling in the index entry by calling the operations pointed to by def. | ||
340 | |||
341 | Note that this function never returns an error - all errors are handled | ||
342 | internally. It may, however, return NULL to indicate no cookie. It is quite | ||
343 | acceptable to pass this token back to this function as the parent to another | ||
344 | acquisition (or even to the relinquish cookie, read page and write page | ||
345 | functions - see below). | ||
346 | |||
347 | Note also that no indices are actually created in a cache until a non-index | ||
348 | object needs to be created somewhere down the hierarchy. Furthermore, an index | ||
349 | may be created in several different caches independently at different times. | ||
350 | This is all handled transparently, and the netfs doesn't see any of it. | ||
351 | |||
352 | For example, with AFS, a cell would be added to the primary index. This index | ||
353 | entry would have a dependent inode containing a volume location index for the | ||
354 | volume mappings within this cell: | ||
355 | |||
356 | cell->cache = | ||
357 | fscache_acquire_cookie(afs_cache_netfs.primary_index, | ||
358 | &afs_cell_cache_index_def, | ||
359 | cell); | ||
360 | |||
361 | Then when a volume location was accessed, it would be entered into the cell's | ||
362 | index and an inode would be allocated that acts as a volume type and hash chain | ||
363 | combination: | ||
364 | |||
365 | vlocation->cache = | ||
366 | fscache_acquire_cookie(cell->cache, | ||
367 | &afs_vlocation_cache_index_def, | ||
368 | vlocation); | ||
369 | |||
370 | And then a particular flavour of volume (R/O for example) could be added to | ||
371 | that index, creating another index for vnodes (AFS inode equivalents): | ||
372 | |||
373 | volume->cache = | ||
374 | fscache_acquire_cookie(vlocation->cache, | ||
375 | &afs_volume_cache_index_def, | ||
376 | volume); | ||
377 | |||
378 | |||
379 | ====================== | ||
380 | DATA FILE REGISTRATION | ||
381 | ====================== | ||
382 | |||
383 | The fourth step is to request a data file be created in the cache. This is | ||
384 | identical to index cookie acquisition. The only difference is that the type in | ||
385 | the object definition should be something other than index type. | ||
386 | |||
387 | vnode->cache = | ||
388 | fscache_acquire_cookie(volume->cache, | ||
389 | &afs_vnode_cache_object_def, | ||
390 | vnode); | ||
391 | |||
392 | |||
393 | ================================= | ||
394 | MISCELLANEOUS OBJECT REGISTRATION | ||
395 | ================================= | ||
396 | |||
397 | An optional step is to request an object of miscellaneous type be created in | ||
398 | the cache. This is almost identical to index cookie acquisition. The only | ||
399 | difference is that the type in the object definition should be something other | ||
400 | than index type. Whilst the parent object could be an index, it's more likely | ||
401 | it would be some other type of object such as a data file. | ||
402 | |||
403 | xattr->cache = | ||
404 | fscache_acquire_cookie(vnode->cache, | ||
405 | &afs_xattr_cache_object_def, | ||
406 | xattr); | ||
407 | |||
408 | Miscellaneous objects might be used to store extended attributes or directory | ||
409 | entries for example. | ||
410 | |||
411 | |||
412 | ========================== | ||
413 | SETTING THE DATA FILE SIZE | ||
414 | ========================== | ||
415 | |||
416 | The fifth step is to set the physical attributes of the file, such as its size. | ||
417 | This doesn't automatically reserve any space in the cache, but permits the | ||
418 | cache to adjust its metadata for data tracking appropriately: | ||
419 | |||
420 | int fscache_attr_changed(struct fscache_cookie *cookie); | ||
421 | |||
422 | The cache will return -ENOBUFS if there is no backing cache or if there is no | ||
423 | space to allocate any extra metadata required in the cache. The attributes | ||
424 | will be accessed with the get_attr() cookie definition operation. | ||
425 | |||
426 | Note that attempts to read or write data pages in the cache over this size may | ||
427 | be rebuffed with -ENOBUFS. | ||
428 | |||
429 | This operation schedules an attribute adjustment to happen asynchronously at | ||
430 | some point in the future, and as such, it may happen after the function returns | ||
431 | to the caller. The attribute adjustment is exclusive of read and write operations. | ||
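
For instance, a hypothetical netfs might notify the cache from wherever it
learns of a new file size (reusing the myfs_vnode sketch from the object
definition section):

	static void myfs_note_size_change(struct myfs_vnode *vnode, u64 new_size)
	{
		int ret;

		vnode->size = new_size;	/* get_attr() will now report this size */

		if (!vnode->cache)
			return;

		/* schedule the asynchronous metadata adjustment; -ENOBUFS just
		 * means there's no cache or no room, so the file won't be
		 * cached beyond what's already there */
		ret = fscache_attr_changed(vnode->cache);
		if (ret < 0 && ret != -ENOBUFS)
			printk(KERN_WARNING "myfs: attr change failed: %d\n", ret);
	}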
432 | |||
433 | |||
434 | ===================== | ||
435 | PAGE READ/ALLOC/WRITE | ||
436 | ===================== | ||
437 | |||
438 | And the sixth step is to store and retrieve pages in the cache. There are | ||
439 | three functions that are used to do this. | ||
440 | |||
441 | Note: | ||
442 | |||
443 | (1) A page should not be re-read or re-allocated without uncaching it first. | ||
444 | |||
445 | (2) A read or allocated page must be uncached when the netfs page is released | ||
446 | from the pagecache. | ||
447 | |||
448 | (3) A page should only be written to the cache if previously read or allocated. | ||
449 | |||
450 | This permits the cache to maintain its page tracking in proper order. | ||
451 | |||
452 | |||
453 | PAGE READ | ||
454 | --------- | ||
455 | |||
456 | Firstly, the netfs should ask FS-Cache to examine the caches and read the | ||
457 | contents cached for a particular page of a particular file if present, or else | ||
458 | allocate space to store the contents if not: | ||
459 | |||
460 | typedef | ||
461 | void (*fscache_rw_complete_t)(struct page *page, | ||
462 | void *context, | ||
463 | int error); | ||
464 | |||
465 | int fscache_read_or_alloc_page(struct fscache_cookie *cookie, | ||
466 | struct page *page, | ||
467 | fscache_rw_complete_t end_io_func, | ||
468 | void *context, | ||
469 | gfp_t gfp); | ||
470 | |||
471 | The cookie argument must specify a cookie for an object that isn't an index, | ||
472 | the page specified will have the data loaded into it (and is also used to | ||
473 | specify the page number), and the gfp argument is used to control how any | ||
474 | memory allocations made are satisfied. | ||
475 | |||
476 | If the cookie indicates the inode is not cached: | ||
477 | |||
478 | (1) The function will return -ENOBUFS. | ||
479 | |||
480 | Else if there's a copy of the page resident in the cache: | ||
481 | |||
482 | (1) The mark_pages_cached() cookie operation will be called on that page. | ||
483 | |||
484 | (2) The function will submit a request to read the data from the cache's | ||
485 | backing device directly into the page specified. | ||
486 | |||
487 | (3) The function will return 0. | ||
488 | |||
489 | (4) When the read is complete, end_io_func() will be invoked with: | ||
490 | |||
491 | (*) The netfs data supplied when the cookie was created. | ||
492 | |||
493 | (*) The page descriptor. | ||
494 | |||
495 | (*) The context argument passed to the above function. This will be | ||
496 | maintained with the get_context/put_context functions mentioned above. | ||
497 | |||
498 | (*) An argument that's 0 on success or negative for an error code. | ||
499 | |||
500 | If an error occurs, it should be assumed that the page contains no usable | ||
501 | data. | ||
502 | |||
503 | end_io_func() will be called in process context if the read results in | ||
504 | an error, but it might be called in interrupt context if the read is | ||
505 | successful. | ||
506 | |||
507 | Otherwise, if there's not a copy available in cache, but the cache may be able | ||
508 | to store the page: | ||
509 | |||
510 | (1) The mark_pages_cached() cookie operation will be called on that page. | ||
511 | |||
512 | (2) A block may be reserved in the cache and attached to the object at the | ||
513 | appropriate place. | ||
514 | |||
515 | (3) The function will return -ENODATA. | ||
516 | |||
517 | This function may also return -ENOMEM or -EINTR, in which case it won't have | ||
518 | read any data from the cache. | ||
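
Putting this together, a readpage() implementation might try the cache first
and fall back to the server on -ENODATA or -ENOBUFS. This is only a sketch:
MYFS_I() (mapping a VFS inode to the hypothetical myfs_vnode) and
myfs_fetch_page_from_server() are assumptions, and a real netfs would also
retry from the server if the completion callback reports an error:

	static void myfs_readpage_complete(struct page *page, void *context,
					   int error)
	{
		/* may be called in interrupt context if the read succeeded */
		if (!error)
			SetPageUptodate(page);
		unlock_page(page);
	}

	static int myfs_readpage(struct file *file, struct page *page)
	{
		struct myfs_vnode *vnode = MYFS_I(page->mapping->host);
		int ret;

		ret = fscache_read_or_alloc_page(vnode->cache, page,
						 myfs_readpage_complete,
						 NULL, GFP_KERNEL);
		switch (ret) {
		case 0:		/* read submitted; completion unlocks the page */
			return 0;
		case -ENODATA:	/* block allocated but no data in the cache */
		case -ENOBUFS:	/* no cache or no space: read from the server */
			return myfs_fetch_page_from_server(vnode, page);
		default:	/* e.g. -ENOMEM/-EINTR: nothing was read */
			unlock_page(page);
			return ret;
		}
	}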
519 | |||
520 | |||
521 | PAGE ALLOCATE | ||
522 | ------------- | ||
523 | |||
524 | Alternatively, if there's not expected to be any data in the cache for a page | ||
525 | because the file has been extended, a block can simply be allocated instead: | ||
526 | |||
527 | int fscache_alloc_page(struct fscache_cookie *cookie, | ||
528 | struct page *page, | ||
529 | gfp_t gfp); | ||
530 | |||
531 | This is similar to the fscache_read_or_alloc_page() function, except that it | ||
532 | never reads from the cache. It will return 0 if a block has been allocated, | ||
533 | rather than -ENODATA as the other would. One or the other must be performed | ||
534 | before writing to the cache. | ||
535 | |||
536 | The mark_pages_cached() cookie operation will be called on the page if | ||
537 | successful. | ||
538 | |||
539 | |||
540 | PAGE WRITE | ||
541 | ---------- | ||
542 | |||
543 | Secondly, if the netfs changes the contents of the page (either due to an | ||
544 | initial download or if a user performs a write), then the page should be | ||
545 | written back to the cache: | ||
546 | |||
547 | int fscache_write_page(struct fscache_cookie *cookie, | ||
548 | struct page *page, | ||
549 | gfp_t gfp); | ||
550 | |||
551 | The cookie argument must specify a data file cookie, the page specified should | ||
552 | contain the data to be written (and is also used to specify the page number), | ||
553 | and the gfp argument is used to control how any memory allocations made are | ||
554 | satisfied. | ||
555 | |||
556 | The page must have first been read or allocated successfully and must not have | ||
557 | been uncached before writing is performed. | ||
558 | |||
559 | If the cookie indicates the inode is not cached then: | ||
560 | |||
561 | (1) The function will return -ENOBUFS. | ||
562 | |||
563 | Else if space can be allocated in the cache to hold this page: | ||
564 | |||
565 | (1) PG_fscache_write will be set on the page. | ||
566 | |||
567 | (2) The function will submit a request to write the data to cache's backing | ||
568 | device directly from the page specified. | ||
569 | |||
570 | (3) The function will return 0. | ||
571 | |||
572 | (4) When the write is complete PG_fscache_write is cleared on the page and | ||
573 | anyone waiting for that bit will be woken up. | ||
574 | |||
575 | Else if there's no space available in the cache, -ENOBUFS will be returned. It | ||
576 | is also possible for the PG_fscache_write bit to be cleared when no write took | ||
577 | place if unforeseen circumstances arose (such as a disk error). | ||
578 | |||
579 | Writing takes place asynchronously. | ||
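
For example, once a page has been filled from the server (or modified by the
user), a hypothetical netfs might copy it into the cache like this, bearing in
mind that the page must already have been read or allocated in the cache
(fscache_uncache_page() is described under PAGE UNCACHING below):

	static void myfs_write_page_to_cache(struct myfs_vnode *vnode,
					     struct page *page)
	{
		int ret;

		ret = fscache_write_page(vnode->cache, page, GFP_KERNEL);
		if (ret < 0)
			/* -ENOBUFS or another error: the copy won't happen, so
			 * withdraw the cache's interest in this page */
			fscache_uncache_page(vnode->cache, page);
	}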
580 | |||
581 | |||
582 | MULTIPLE PAGE READ | ||
583 | ------------------ | ||
584 | |||
585 | A facility is provided to read several pages at once, as requested by the | ||
586 | readpages() address space operation: | ||
587 | |||
588 | int fscache_read_or_alloc_pages(struct fscache_cookie *cookie, | ||
589 | struct address_space *mapping, | ||
590 | struct list_head *pages, | ||
591 | int *nr_pages, | ||
592 | fscache_rw_complete_t end_io_func, | ||
593 | void *context, | ||
594 | gfp_t gfp); | ||
595 | |||
596 | This works in a similar way to fscache_read_or_alloc_page(), except: | ||
597 | |||
598 | (1) Any page it can retrieve data for is removed from pages and nr_pages and | ||
599 | dispatched for reading to the disk. Reads of adjacent pages on disk may | ||
600 | be merged for greater efficiency. | ||
601 | |||
602 | (2) The mark_pages_cached() cookie operation will be called on several pages | ||
603 | at once if they're being read or allocated. | ||
604 | |||
605 | (3) If there was a general error, then that error will be returned. | ||
606 | |||
607 | Else if some pages couldn't be allocated or read, then -ENOBUFS will be | ||
608 | returned. | ||
609 | |||
610 | Else if some pages couldn't be read but were allocated, then -ENODATA will | ||
611 | be returned. | ||
612 | |||
613 | Otherwise, if all pages had reads dispatched, then 0 will be returned, the | ||
614 | list will be empty and *nr_pages will be 0. | ||
615 | |||
616 | (4) end_io_func will be called once for each page being read as the reads | ||
617 | complete. It will be called in process context if error != 0, but it may | ||
618 | be called in interrupt context if there is no error. | ||
619 | |||
620 | Note that a return of -ENODATA, -ENOBUFS or any other error does not preclude | ||
621 | some of the pages being read and some being allocated. Those pages will have | ||
622 | been marked appropriately and will need uncaching. | ||
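
A readpages() implementation might use this along the following lines, fetching
from the server whatever the cache could not supply. As before, MYFS_I() and
myfs_fetch_pages_from_server() are hypothetical, and myfs_readpage_complete()
is the completion handler sketched under PAGE READ:

	static int myfs_readpages(struct file *file,
				  struct address_space *mapping,
				  struct list_head *pages, unsigned nr_pages)
	{
		struct myfs_vnode *vnode = MYFS_I(mapping->host);
		int remaining = nr_pages;
		int ret;

		ret = fscache_read_or_alloc_pages(vnode->cache, mapping, pages,
						  &remaining,
						  myfs_readpage_complete,
						  NULL, GFP_KERNEL);
		if (ret == 0)
			return 0;	/* all pages dispatched from the cache */

		/* -ENODATA/-ENOBUFS/other: whatever is still on the list must
		 * be read from the server instead */
		return myfs_fetch_pages_from_server(vnode, mapping, pages,
						    remaining);
	}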
623 | |||
624 | |||
625 | ============== | ||
626 | PAGE UNCACHING | ||
627 | ============== | ||
628 | |||
629 | To uncache a page, this function should be called: | ||
630 | |||
631 | void fscache_uncache_page(struct fscache_cookie *cookie, | ||
632 | struct page *page); | ||
633 | |||
634 | This function permits the cache to release any in-memory representation it | ||
635 | might be holding for this netfs page. This function must be called once for | ||
636 | each page on which the read or write page functions above have been called to | ||
637 | make sure the cache's in-memory tracking information gets torn down. | ||
638 | |||
639 | Note that pages can't be explicitly deleted from a data file. The whole | ||
640 | data file must be retired (see the relinquish cookie function below). | ||
641 | |||
642 | Furthermore, note that this does not cancel the asynchronous read or write | ||
643 | operation started by the read/alloc and write functions, so the page | ||
644 | invalidation and release functions must use: | ||
645 | |||
646 | bool fscache_check_page_write(struct fscache_cookie *cookie, | ||
647 | struct page *page); | ||
648 | |||
649 | to see if a page is being written to the cache, and: | ||
650 | |||
651 | void fscache_wait_on_page_write(struct fscache_cookie *cookie, | ||
652 | struct page *page); | ||
653 | |||
654 | to wait for it to finish if it is. | ||
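
For instance, a releasepage() implementation might tie these together, waiting
for any pending write to the cache and then dropping the cache's interest in
the page. This is a sketch only; the PageFsCache()/ClearPageFsCache()
operations are described in the last section of this document, and MYFS_I()
remains a hypothetical helper:

	static int myfs_releasepage(struct page *page, gfp_t gfp)
	{
		struct myfs_vnode *vnode = MYFS_I(page->mapping->host);

		if (PageFsCache(page)) {
			/* the cache may still be writing this page out */
			if (fscache_check_page_write(vnode->cache, page)) {
				if (!(gfp & __GFP_WAIT))
					return 0;	/* can't wait: refuse */
				fscache_wait_on_page_write(vnode->cache, page);
			}
			fscache_uncache_page(vnode->cache, page);
			ClearPageFsCache(page);
		}
		return 1;	/* the page may now be released */
	}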
655 | |||
656 | |||
657 | ========================== | ||
658 | INDEX AND DATA FILE UPDATE | ||
659 | ========================== | ||
660 | |||
661 | To request an update of the index data for an index or other object, the | ||
662 | following function should be called: | ||
663 | |||
664 | void fscache_update_cookie(struct fscache_cookie *cookie); | ||
665 | |||
666 | This function will refer back to the netfs_data pointer stored in the cookie by | ||
667 | the acquisition function to obtain the data to write into each revised index | ||
668 | entry. The update method in the parent index definition will be called to | ||
669 | transfer the data. | ||
670 | |||
671 | Note that partial updates may happen automatically at other times, such as when | ||
672 | data blocks are added to a data file object. | ||
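
For example, a hypothetical netfs might do this whenever the server hands it a
new data version for a file whose auxiliary data is derived from that version:

	static void myfs_note_new_data_version(struct myfs_vnode *vnode,
					       u32 new_version)
	{
		vnode->data_version = new_version;

		/* rewrite the index entry from the data reachable through the
		 * cookie's netfs_data pointer */
		if (vnode->cache)
			fscache_update_cookie(vnode->cache);
	}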
673 | |||
674 | |||
675 | =============================== | ||
676 | MISCELLANEOUS COOKIE OPERATIONS | ||
677 | =============================== | ||
678 | |||
679 | There are a number of operations that can be used to control cookies; a usage sketch follows the list: | ||
680 | |||
681 | (*) Cookie pinning: | ||
682 | |||
683 | int fscache_pin_cookie(struct fscache_cookie *cookie); | ||
684 | void fscache_unpin_cookie(struct fscache_cookie *cookie); | ||
685 | |||
686 | These operations permit data cookies to be pinned into the cache and to | ||
687 | have the pinning removed. They are not permitted on index cookies. | ||
688 | |||
689 | The pinning function will return 0 if successful, -ENOBUFS if the cookie | ||
690 | isn't backed by a cache, -EOPNOTSUPP if the cache doesn't support pinning, | ||
691 | -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or | ||
692 | -EIO if there's any other problem. | ||
693 | |||
694 | (*) Data space reservation: | ||
695 | |||
696 | int fscache_reserve_space(struct fscache_cookie *cookie, loff_t size); | ||
697 | |||
698 | This permits a netfs to request cache space be reserved to store up to the | ||
699 | given amount of a file. It is permitted to ask for more than the current | ||
700 | size of the file to allow for future file expansion. | ||
701 | |||
702 | If size is given as zero then the reservation will be cancelled. | ||
703 | |||
704 | The function will return 0 if successful, -ENOBUFS if the cookie isn't | ||
705 | backed by a cache, -EOPNOTSUPP if the cache doesn't support reservations, | ||
706 | -ENOSPC if there isn't enough space to honour the operation, -ENOMEM or | ||
707 | -EIO if there's any other problem. | ||
708 | |||
709 | Note that this doesn't pin an object in a cache; it can still be culled to | ||
710 | make space if it's not in use. | ||
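
As a combined sketch, a netfs that wants a particular file held in the cache in
its entirety might reserve space for it and then pin it while it's in use
(error handling is minimal and the myfs_* names remain hypothetical):

	static int myfs_make_file_resident(struct myfs_vnode *vnode)
	{
		int ret;

		/* ask for space for the file's current size; more could be
		 * requested to allow for growth */
		ret = fscache_reserve_space(vnode->cache, vnode->size);
		if (ret < 0)
			return ret;	/* -ENOBUFS, -EOPNOTSUPP, -ENOSPC, ... */

		/* reservation alone doesn't prevent culling, so pin it too */
		return fscache_pin_cookie(vnode->cache);
	}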
711 | |||
712 | |||
713 | ===================== | ||
714 | COOKIE UNREGISTRATION | ||
715 | ===================== | ||
716 | |||
717 | To get rid of a cookie, this function should be called. | ||
718 | |||
719 | void fscache_relinquish_cookie(struct fscache_cookie *cookie, | ||
720 | int retire); | ||
721 | |||
722 | If retire is non-zero, then the object will be marked for recycling, and all | ||
723 | copies of it will be removed from all active caches in which it is present. | ||
724 | Not only that but all child objects will also be retired. | ||
725 | |||
726 | If retire is zero, then the object may be available again when next the | ||
727 | acquisition function is called. Retirement here will overrule the pinning on a | ||
728 | cookie. | ||
729 | |||
730 | One very important note - relinquish must NOT be called for a cookie unless all | ||
731 | the cookies for "child" indices, objects and pages have been relinquished | ||
732 | first. | ||
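
For instance, when evicting its in-memory inode, a hypothetical netfs would
first uncache any pages and relinquish any child cookies (xattrs, directory
entries), then drop the data file cookie itself, retiring it if the file was
deleted on the server:

	static void myfs_evict_vnode(struct myfs_vnode *vnode, bool deleted)
	{
		/* all child cookies and cached pages must already be gone */
		fscache_relinquish_cookie(vnode->cache, deleted ? 1 : 0);
		vnode->cache = NULL;
	}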
733 | |||
734 | |||
735 | ================================ | ||
736 | INDEX AND DATA FILE INVALIDATION | ||
737 | ================================ | ||
738 | |||
739 | There is no direct way to invalidate an index subtree or a data file. To do | ||
740 | this, the caller should relinquish and retire the cookie they have, and then | ||
741 | acquire a new one. | ||
742 | |||
743 | |||
744 | =========================== | ||
745 | FS-CACHE SPECIFIC PAGE FLAG | ||
746 | =========================== | ||
747 | |||
748 | FS-Cache makes use of a page flag, PG_private_2, for its own purpose. This is | ||
749 | given the alternative name PG_fscache. | ||
750 | |||
751 | PG_fscache is used to indicate that the page is known by the cache, and that | ||
752 | the cache must be informed if the page is going to go away. It's an indication | ||
753 | to the netfs that the cache has an interest in this page, where an interest may | ||
754 | be a pointer to it, resources allocated or reserved for it, or I/O in progress | ||
755 | upon it. | ||
756 | |||
757 | The netfs can use this information in methods such as releasepage() to | ||
758 | determine whether it needs to uncache a page or update it. | ||
759 | |||
760 | Furthermore, if this bit is set, releasepage() and invalidatepage() operations | ||
761 | will be called on a page to get rid of it, even if PG_private is not set. This | ||
762 | allows caching to be attempted on a page before read_cache_pages() is called | ||
763 | after fscache_read_or_alloc_pages(), as the former will try to release pages | ||
764 | it was given under certain circumstances. | ||
765 | |||
766 | This bit does not overlap with flags such as PG_private. This means that | ||
767 | FS-Cache can be used with a filesystem that uses the block buffering code. | ||
768 | |||
769 | There are a number of operations defined on this flag: | ||
770 | |||
771 | int PageFsCache(struct page *page); | ||
772 | void SetPageFsCache(struct page *page) | ||
773 | void ClearPageFsCache(struct page *page) | ||
774 | int TestSetPageFsCache(struct page *page) | ||
775 | int TestClearPageFsCache(struct page *page) | ||
776 | |||
777 | These functions are bit test, bit set, bit clear, bit test and set and bit | ||
778 | test and clear operations on PG_fscache. | ||
diff --git a/Documentation/filesystems/caching/object.txt b/Documentation/filesystems/caching/object.txt new file mode 100644 index 000000000000..e8b0a35d8fe5 --- /dev/null +++ b/Documentation/filesystems/caching/object.txt | |||
@@ -0,0 +1,313 @@ | |||
1 | ==================================================== | ||
2 | IN-KERNEL CACHE OBJECT REPRESENTATION AND MANAGEMENT | ||
3 | ==================================================== | ||
4 | |||
5 | By: David Howells <dhowells@redhat.com> | ||
6 | |||
7 | Contents: | ||
8 | |||
9 | (*) Representation | ||
10 | |||
11 | (*) Object management state machine. | ||
12 | |||
13 | - Provision of cpu time. | ||
14 | - Locking simplification. | ||
15 | |||
16 | (*) The set of states. | ||
17 | |||
18 | (*) The set of events. | ||
19 | |||
20 | |||
21 | ============== | ||
22 | REPRESENTATION | ||
23 | ============== | ||
24 | |||
25 | FS-Cache maintains an in-kernel representation of each object that a netfs is | ||
26 | currently interested in. Such objects are represented by the fscache_cookie | ||
27 | struct and are referred to as cookies. | ||
28 | |||
29 | FS-Cache also maintains a separate in-kernel representation of the objects that | ||
30 | a cache backend is currently actively caching. Such objects are represented by | ||
31 | the fscache_object struct. The cache backends allocate these upon request, and | ||
32 | are expected to embed them in their own representations. These are referred to | ||
33 | as objects. | ||
34 | |||
35 | There is a 1:N relationship between cookies and objects. A cookie may be | ||
36 | represented by multiple objects - an index may exist in more than one cache - | ||
37 | or even by no objects (it may not be cached). | ||
38 | |||
39 | Furthermore, both cookies and objects are hierarchical. The two hierarchies | ||
40 | correspond, but the cookies tree is a superset of the union of the object trees | ||
41 | of multiple caches: | ||
42 | |||
43 | NETFS INDEX TREE : CACHE 1 : CACHE 2 | ||
44 | : : | ||
45 | : +-----------+ : | ||
46 | +----------->| IObject | : | ||
47 | +-----------+ | : +-----------+ : | ||
48 | | ICookie |-------+ : | : | ||
49 | +-----------+ | : | : +-----------+ | ||
50 | | +------------------------------>| IObject | | ||
51 | | : | : +-----------+ | ||
52 | | : V : | | ||
53 | | : +-----------+ : | | ||
54 | V +----------->| IObject | : | | ||
55 | +-----------+ | : +-----------+ : | | ||
56 | | ICookie |-------+ : | : V | ||
57 | +-----------+ | : | : +-----------+ | ||
58 | | +------------------------------>| IObject | | ||
59 | +-----+-----+ : | : +-----------+ | ||
60 | | | : | : | | ||
61 | V | : V : | | ||
62 | +-----------+ | : +-----------+ : | | ||
63 | | ICookie |------------------------->| IObject | : | | ||
64 | +-----------+ | : +-----------+ : | | ||
65 | | V : | : V | ||
66 | | +-----------+ : | : +-----------+ | ||
67 | | | ICookie |-------------------------------->| IObject | | ||
68 | | +-----------+ : | : +-----------+ | ||
69 | V | : V : | | ||
70 | +-----------+ | : +-----------+ : | | ||
71 | | DCookie |------------------------->| DObject | : | | ||
72 | +-----------+ | : +-----------+ : | | ||
73 | | : : | | ||
74 | +-------+-------+ : : | | ||
75 | | | : : | | ||
76 | V V : : V | ||
77 | +-----------+ +-----------+ : : +-----------+ | ||
78 | | DCookie | | DCookie |------------------------>| DObject | | ||
79 | +-----------+ +-----------+ : : +-----------+ | ||
80 | : : | ||
81 | |||
82 | In the above illustration, ICookie and IObject represent indices and DCookie | ||
83 | and DObject represent data storage objects. Indices may have representation in | ||
84 | multiple caches, but currently, non-index objects may not. Objects of any type | ||
85 | may also be entirely unrepresented. | ||
86 | |||
87 | As far as the netfs API goes, the netfs is only actually permitted to see | ||
88 | pointers to the cookies. The cookies themselves and any objects attached to | ||
89 | those cookies are hidden from it. | ||
90 | |||
91 | |||
92 | =============================== | ||
93 | OBJECT MANAGEMENT STATE MACHINE | ||
94 | =============================== | ||
95 | |||
96 | Within FS-Cache, each active object is managed by its own individual state | ||
97 | machine. The state for an object is kept in the fscache_object struct, in | ||
98 | object->state. A cookie may point to a set of objects that are in different | ||
99 | states. | ||
100 | |||
101 | Each state has an action associated with it that is invoked when the machine | ||
102 | wakes up in that state. There are four logical sets of states: | ||
103 | |||
104 | (1) Preparation: states that wait for the parent objects to become ready. The | ||
105 | representations are hierarchical, and it is expected that an object must | ||
106 | be created or accessed with respect to its parent object. | ||
107 | |||
108 | (2) Initialisation: states that perform lookups in the cache and validate | ||
109 | what's found and that create on disk any missing metadata. | ||
110 | |||
111 | (3) Normal running: states that allow netfs operations on objects to proceed | ||
112 | and that update the state of objects. | ||
113 | |||
114 | (4) Termination: states that detach objects from their netfs cookies, that | ||
115 | delete objects from disk, that handle disk and system errors and that free | ||
116 | up in-memory resources. | ||
117 | |||
118 | |||
119 | In most cases, transitioning between states is in response to signalled events. | ||
120 | When a state has finished processing, it will usually set the mask of events in | ||
121 | which it is interested (object->event_mask) and relinquish the worker thread. | ||
122 | Then when an event is raised (by calling fscache_raise_event()), if the event | ||
123 | is not masked, the object will be queued for processing (by calling | ||
124 | fscache_enqueue_object()). | ||
125 | |||
126 | |||
127 | PROVISION OF CPU TIME | ||
128 | --------------------- | ||
129 | |||
130 | The work to be done by the various states is given CPU time by the threads of | ||
131 | the slow work facility (see Documentation/slow-work.txt). This is used in | ||
132 | preference to the workqueue facility because: | ||
133 | |||
134 | (1) Threads may be completely occupied for very long periods of time by a | ||
135 | particular work item. These state actions may be doing sequences of | ||
136 | synchronous, journalled disk accesses (lookup, mkdir, create, setxattr, | ||
137 | getxattr, truncate, unlink, rmdir, rename). | ||
138 | |||
139 | (2) Threads may do little actual work, but may rather spend a lot of time | ||
140 | sleeping on I/O. This means that single-threaded and 1-per-CPU-threaded | ||
141 | workqueues don't necessarily have the right numbers of threads. | ||
142 | |||
143 | |||
144 | LOCKING SIMPLIFICATION | ||
145 | ---------------------- | ||
146 | |||
147 | Because only one worker thread may be operating on any particular object's | ||
148 | state machine at once, this simplifies the locking, particularly with respect | ||
149 | to disconnecting the netfs's representation of a cache object (fscache_cookie) | ||
150 | from the cache backend's representation (fscache_object) - which may be | ||
151 | requested from either end. | ||
152 | |||
153 | |||
154 | ================= | ||
155 | THE SET OF STATES | ||
156 | ================= | ||
157 | |||
158 | The object state machine has a set of states that it can be in. There are | ||
159 | preparation states in which the object sets itself up and waits for its parent | ||
160 | object to transit to a state that allows access to its children: | ||
161 | |||
162 | (1) State FSCACHE_OBJECT_INIT. | ||
163 | |||
164 | Initialise the object and wait for the parent object to become active. In | ||
165 | the cache, it is expected that it will not be possible to look an object | ||
166 | up from the parent object, until that parent object itself has been looked | ||
167 | up. | ||
168 | |||
169 | There are initialisation states in which the object sets itself up and accesses | ||
170 | disk for the object metadata: | ||
171 | |||
172 | (2) State FSCACHE_OBJECT_LOOKING_UP. | ||
173 | |||
174 | Look up the object on disk, using the parent as a starting point. | ||
175 | FS-Cache expects the cache backend to probe the cache to see whether this | ||
176 | object is represented there, and if it is, to see if it's valid (coherency | ||
177 | management). | ||
178 | |||
179 | The cache should call fscache_object_lookup_negative() to indicate lookup | ||
180 | failure for whatever reason, and should call fscache_obtained_object() to | ||
181 | indicate success. | ||
182 | |||
183 | At the completion of lookup, FS-Cache will let the netfs go ahead with | ||
184 | read operations, no matter whether the file is yet cached. If not yet | ||
185 | cached, read operations will be immediately rejected with ENODATA until | ||
186 | the first known page is uncached - as up to that point there can be no data | ||
187 | to be read out of the cache for that file that isn't currently also held | ||
188 | in the pagecache. | ||
189 | |||
190 | (3) State FSCACHE_OBJECT_CREATING. | ||
191 | |||
192 | Create an object on disk, using the parent as a starting point. This | ||
193 | happens if the lookup failed to find the object, or if the object's | ||
194 | coherency data indicated that what's on disk is out of date. In this state, | ||
195 | FS-Cache expects the cache to create the object on disk (see the sketch after this list of states). | ||
196 | |||
197 | The cache should call fscache_obtained_object() if creation completes | ||
198 | successfully, fscache_object_lookup_negative() otherwise. | ||
199 | |||
200 | At the completion of creation, FS-Cache will start processing write | ||
201 | operations the netfs has queued for an object. If creation failed, the | ||
202 | write ops will be transparently discarded, and nothing recorded in the | ||
203 | cache. | ||
204 | |||
205 | There are some normal running states in which the object spends its time | ||
206 | servicing netfs requests: | ||
207 | |||
208 | (4) State FSCACHE_OBJECT_AVAILABLE. | ||
209 | |||
210 | A transient state in which pending operations are started, child objects | ||
211 | are permitted to advance from FSCACHE_OBJECT_INIT state, and temporary | ||
212 | lookup data is freed. | ||
213 | |||
214 | (5) State FSCACHE_OBJECT_ACTIVE. | ||
215 | |||
216 | The normal running state. In this state, requests the netfs makes will be | ||
217 | passed on to the cache. | ||
218 | |||
219 | (6) State FSCACHE_OBJECT_UPDATING. | ||
220 | |||
221 | The state machine comes here to update the object in the cache from the | ||
222 | netfs's records. This involves updating the auxiliary data that is used | ||
223 | to maintain coherency. | ||
224 | |||
225 | And there are terminal states in which an object cleans itself up, deallocates | ||
226 | memory and potentially deletes stuff from disk: | ||
227 | |||
228 | (7) State FSCACHE_OBJECT_LC_DYING. | ||
229 | |||
230 | The object comes here if it is dying because of a lookup or creation | ||
231 | error. This would be due to a disk error or system error of some sort. | ||
232 | Temporary data is cleaned up, and the parent is released. | ||
233 | |||
234 | (8) State FSCACHE_OBJECT_DYING. | ||
235 | |||
236 | The object comes here if it is dying due to an error, because its parent | ||
237 | cookie has been relinquished by the netfs or because the cache is being | ||
238 | withdrawn. | ||
239 | |||
240 | Any child objects waiting on this one are given CPU time so that they too | ||
241 | can destroy themselves. This object waits for all its children to go away | ||
242 | before advancing to the next state. | ||
243 | |||
244 | (9) State FSCACHE_OBJECT_ABORT_INIT. | ||
245 | |||
246 | The object comes to this state if it was waiting on its parent in | ||
247 | FSCACHE_OBJECT_INIT, but its parent died. The object will destroy itself | ||
248 | so that the parent may proceed from the FSCACHE_OBJECT_DYING state. | ||
249 | |||
250 | (10) State FSCACHE_OBJECT_RELEASING. | ||
251 | (11) State FSCACHE_OBJECT_RECYCLING. | ||
252 | |||
253 | The object comes to one of these two states when dying once it is rid of | ||
254 | all its children, if it is dying because the netfs relinquished its | ||
255 | cookie. In the first state, the cached data is expected to persist, and | ||
256 | in the second it will be deleted. | ||
257 | |||
258 | (12) State FSCACHE_OBJECT_WITHDRAWING. | ||
259 | |||
260 | The object transits to this state if the cache decides it wants to | ||
261 | withdraw the object from service, perhaps to make space, but also due to | ||
262 | error or just because the whole cache is being withdrawn. | ||
263 | |||
264 | (13) State FSCACHE_OBJECT_DEAD. | ||
265 | |||
266 | The object transits to this state when the in-memory object record is | ||
267 | ready to be deleted. The object processor shouldn't ever see an object in | ||
268 | this state. | ||
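As referenced from state (3), here is a rough sketch of what a cache backend's
lookup routine does for the LOOKING_UP and CREATING states.  The
fscache_object_lookup_negative() and fscache_obtained_object() calls come from
the text above; the my_backend_*() helpers and the overall shape are purely
illustrative assumptions:

	static void my_backend_lookup_object(struct fscache_object *object)
	{
		if (my_backend_find_on_disk(object) &&
		    my_backend_coherency_ok(object)) {
			/* valid data found on disk - lookup succeeded */
			fscache_obtained_object(object);
			return;
		}

		/* nothing usable on disk: netfs reads will now see ENODATA
		 * and the object will move on to FSCACHE_OBJECT_CREATING,
		 * where the backend creates the object and then calls
		 * fscache_obtained_object() on success */
		fscache_object_lookup_negative(object);
	}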
269 | |||
270 | |||
271 | THE SET OF EVENTS | ||
272 | ----------------- | ||
273 | |||
274 | There are a number of events that can be raised to an object state machine: | ||
275 | |||
276 | (*) FSCACHE_OBJECT_EV_UPDATE | ||
277 | |||
278 | The netfs requested that an object be updated. The state machine will ask | ||
279 | the cache backend to update the object, and the cache backend will ask the | ||
280 | netfs for details of the change through its cookie definition ops. | ||
281 | |||
282 | (*) FSCACHE_OBJECT_EV_CLEARED | ||
283 | |||
284 | This is signalled in two circumstances: | ||
285 | |||
286 | (a) when an object's last child object is dropped and | ||
287 | |||
288 | (b) when the last operation outstanding on an object is completed. | ||
289 | |||
290 | This is used to proceed from the dying state. | ||
291 | |||
292 | (*) FSCACHE_OBJECT_EV_ERROR | ||
293 | |||
294 | This is signalled when an I/O error occurs during the processing of some | ||
295 | object. | ||
296 | |||
297 | (*) FSCACHE_OBJECT_EV_RELEASE | ||
298 | (*) FSCACHE_OBJECT_EV_RETIRE | ||
299 | |||
300 | These are signalled when the netfs relinquishes a cookie it was using. | ||
301 | The event selected depends on whether the netfs asks for the backing | ||
302 | object to be retired (deleted) or retained. | ||
303 | |||
304 | (*) FSCACHE_OBJECT_EV_WITHDRAW | ||
305 | |||
306 | This is signalled when the cache backend wants to withdraw an object. | ||
307 | This means that the object will have to be detached from the netfs's | ||
308 | cookie. | ||
309 | |||
310 | Because the withdraw, release and retire events are all handled by the object | ||
311 | state machine, it doesn't matter if there's a collision with both ends trying | ||
312 | to sever the connection at the same time. The state machine can just pick | ||
313 | whichever one it wants to honour, and doing so also takes care of the other. | ||
diff --git a/Documentation/filesystems/caching/operations.txt b/Documentation/filesystems/caching/operations.txt new file mode 100644 index 000000000000..b6b070c57cbf --- /dev/null +++ b/Documentation/filesystems/caching/operations.txt | |||
@@ -0,0 +1,213 @@ | |||
1 | ================================ | ||
2 | ASYNCHRONOUS OPERATIONS HANDLING | ||
3 | ================================ | ||
4 | |||
5 | By: David Howells <dhowells@redhat.com> | ||
6 | |||
7 | Contents: | ||
8 | |||
9 | (*) Overview. | ||
10 | |||
11 | (*) Operation record initialisation. | ||
12 | |||
13 | (*) Parameters. | ||
14 | |||
15 | (*) Procedure. | ||
16 | |||
17 | (*) Asynchronous callback. | ||
18 | |||
19 | |||
20 | ======== | ||
21 | OVERVIEW | ||
22 | ======== | ||
23 | |||
24 | FS-Cache has an asynchronous operations handling facility that it uses for its | ||
25 | data storage and retrieval routines. Its operations are represented by | ||
26 | fscache_operation structs, though these are usually embedded into some other | ||
27 | structure. | ||
28 | |||
29 | This facility is available to and expected to be used by the cache backends, | ||
30 | and FS-Cache will create operations and pass them off to the appropriate cache | ||
31 | backend for completion. | ||
32 | |||
33 | To make use of this facility, <linux/fscache-cache.h> should be #included. | ||
34 | |||
35 | |||
36 | =============================== | ||
37 | OPERATION RECORD INITIALISATION | ||
38 | =============================== | ||
39 | |||
40 | An operation is recorded in an fscache_operation struct: | ||
41 | |||
42 | struct fscache_operation { | ||
43 | union { | ||
44 | struct work_struct fast_work; | ||
45 | struct slow_work slow_work; | ||
46 | }; | ||
47 | unsigned long flags; | ||
48 | fscache_operation_processor_t processor; | ||
49 | ... | ||
50 | }; | ||
51 | |||
52 | Someone wanting to issue an operation should allocate something with this | ||
53 | struct embedded in it. They should initialise it by calling: | ||
54 | |||
55 | void fscache_operation_init(struct fscache_operation *op, | ||
56 | fscache_operation_release_t release); | ||
57 | |||
58 | with the operation to be initialised and the release function to use. | ||
59 | |||
60 | The op->flags parameter should be set to indicate the CPU time provision and | ||
61 | the exclusivity (see the Parameters section). | ||
62 | |||
63 | The op->fast_work, op->slow_work and op->processor fields should be set as | ||
64 | appropriate for the CPU time provision (see the Parameters section). | ||
65 | |||
66 | FSCACHE_OP_WAITING may be set in op->flags prior to each submission of the | ||
67 | operation and waited for afterwards. | ||
68 | |||
69 | |||
70 | ========== | ||
71 | PARAMETERS | ||
72 | ========== | ||
73 | |||
74 | There are a number of parameters that can be set in the operation record's flag | ||
75 | parameter. There are three options for the provision of CPU time in these | ||
76 | operations: | ||
77 | |||
78 | (1) The operation may be done synchronously (FSCACHE_OP_MYTHREAD). A thread | ||
79 | may decide it wants to handle an operation itself without deferring it to | ||
80 | another thread. | ||
81 | |||
82 | This is, for example, used in read operations for calling readpages() on | ||
83 | the backing filesystem in CacheFiles. Although readpages() does an | ||
84 | asynchronous data fetch, the determination of whether pages exist is done | ||
85 | synchronously - and the netfs does not proceed until this has been | ||
86 | determined. | ||
87 | |||
88 | If this option is to be used, FSCACHE_OP_WAITING must be set in op->flags | ||
89 | before submitting the operation, and the operating thread must wait for it | ||
90 | to be cleared before proceeding: | ||
91 | |||
92 | wait_on_bit(&op->flags, FSCACHE_OP_WAITING, | ||
93 | fscache_wait_bit, TASK_UNINTERRUPTIBLE); | ||
94 | |||
95 | |||
96 | (2) The operation may be fast asynchronous (FSCACHE_OP_FAST), in which case it | ||
97 | will be given to keventd to process. Such an operation is not permitted | ||
98 | to sleep on I/O. | ||
99 | |||
100 | This is, for example, used by CacheFiles to copy data from a backing fs | ||
101 | page to a netfs page after the backing fs has read the page in. | ||
102 | |||
103 | If this option is used, op->fast_work and op->processor must be | ||
104 | initialised before submitting the operation: | ||
105 | |||
106 | INIT_WORK(&op->fast_work, do_some_work); | ||
107 | |||
108 | |||
109 | (3) The operation may be slow asynchronous (FSCACHE_OP_SLOW), in which case it | ||
110 | will be given to the slow work facility to process. Such an operation is | ||
111 | permitted to sleep on I/O. | ||
112 | |||
113 | This is, for example, used by FS-Cache to handle background writes of | ||
114 | pages that have just been fetched from a remote server. | ||
115 | |||
116 | If this option is used, op->slow_work and op->processor must be | ||
117 | initialised before submitting the operation: | ||
118 | |||
119 | fscache_operation_init_slow(op, processor) | ||
120 | |||
121 | |||
122 | Furthermore, operations may be one of two types: | ||
123 | |||
124 | (1) Exclusive (FSCACHE_OP_EXCLUSIVE). Operations of this type may not run in | ||
125 | conjunction with any other operation on the object being operated upon. | ||
126 | |||
127 | An example of this is the attribute change operation, in which the file | ||
128 | being written to may need truncation. | ||
129 | |||
130 | (2) Shareable. Operations of this type may be running simultaneously. It's | ||
131 | up to the operation implementation to prevent interference between other | ||
132 | operations running at the same time. | ||
133 | |||
134 | |||
135 | ========= | ||
136 | PROCEDURE | ||
137 | ========= | ||
138 | |||
139 | Operations are used through the following procedure: | ||
140 | |||
141 | (1) The submitting thread must allocate the operation and initialise it | ||
142 | itself. Normally this would be part of a more specific structure with the | ||
143 | generic op embedded within. | ||
144 | |||
145 | (2) The submitting thread must then submit the operation for processing using | ||
146 | one of the following two functions: | ||
147 | |||
148 | int fscache_submit_op(struct fscache_object *object, | ||
149 | struct fscache_operation *op); | ||
150 | |||
151 | int fscache_submit_exclusive_op(struct fscache_object *object, | ||
152 | struct fscache_operation *op); | ||
153 | |||
154 | The first function should be used to submit non-exclusive ops and the | ||
155 | second to submit exclusive ones. The caller must still set the | ||
156 | FSCACHE_OP_EXCLUSIVE flag. | ||
157 | |||
158 | If successful, both functions will assign the operation to the specified | ||
159 | object and return 0. -ENOBUFS will be returned if the object specified is | ||
160 | permanently unavailable. | ||
161 | |||
162 | The operation manager will defer operations on an object that is still | ||
163 | undergoing lookup or creation. The operation will also be deferred if an | ||
164 | operation of conflicting exclusivity is in progress on the object. | ||
165 | |||
166 | If the operation is asynchronous, the manager will retain a reference to | ||
167 | it, so the caller should put their reference to it by passing it to: | ||
168 | |||
169 | void fscache_put_operation(struct fscache_operation *op); | ||
170 | |||
171 | (3) If the submitting thread wants to do the work itself, and has marked the | ||
172 | operation with FSCACHE_OP_MYTHREAD, then it should monitor | ||
173 | FSCACHE_OP_WAITING as described above and check the state of the object if | ||
174 | necessary (the object might have died whilst the thread was waiting). | ||
175 | |||
176 | When it has finished doing its processing, it should call | ||
177 | fscache_put_operation() on it. | ||
178 | |||
179 | (4) The operation holds an effective lock upon the object, preventing other | ||
180 | exclusive ops conflicting until it is released. The operation can be | ||
181 | enqueued for further immediate asynchronous processing by adjusting the | ||
182 | CPU time provisioning option if necessary, eg: | ||
183 | |||
184 | op->flags &= ~FSCACHE_OP_TYPE; /* clear the old CPU time provision */ | ||
185 | op->flags |= FSCACHE_OP_FAST; /* and select fast asynchronous */ | ||
186 | |||
187 | and calling: | ||
188 | |||
189 | void fscache_enqueue_operation(struct fscache_operation *op) | ||
190 | |||
191 | This can be used to allow other things to have use of the worker thread | ||
192 | pools. | ||
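To tie the procedure together, here is a hedged sketch of the synchronous
(FSCACHE_OP_MYTHREAD) path, reusing the hypothetical my_read_operation from
the initialisation section above; the error handling is illustrative only:

	static int my_issue_read(struct fscache_object *object)
	{
		struct my_read_operation *rop = my_alloc_read_op();
		int ret;

		if (!rop)
			return -ENOMEM;

		ret = fscache_submit_op(object, &rop->op);	/* step (2) */
		if (ret < 0) {
			/* never granted: drop our reference and give up */
			fscache_put_operation(&rop->op);
			return ret;
		}

		/* step (3): wait to be granted, then do the work ourselves */
		wait_on_bit(&rop->op.flags, FSCACHE_OP_WAITING,
			    fscache_wait_bit, TASK_UNINTERRUPTIBLE);

		/* ... check the object is still live and do the I/O ... */

		fscache_put_operation(&rop->op);	/* we're done with it */
		return 0;
	}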
193 | |||
194 | |||
195 | ===================== | ||
196 | ASYNCHRONOUS CALLBACK | ||
197 | ===================== | ||
198 | |||
199 | When used in asynchronous mode, the worker thread pool will invoke the | ||
200 | processor method with a pointer to the operation. This should then get at the | ||
201 | container struct by using container_of(): | ||
202 | |||
203 | static void fscache_write_op(struct fscache_operation *_op) | ||
204 | { | ||
205 | struct fscache_storage *op = | ||
206 | container_of(_op, struct fscache_storage, op); | ||
207 | ... | ||
208 | } | ||
209 | |||
210 | The caller holds a reference on the operation, and will invoke | ||
211 | fscache_put_operation() when the processor function returns. The processor | ||
212 | function is at liberty to call fscache_enqueue_operation() or to take extra | ||
213 | references. | ||
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt new file mode 100644 index 000000000000..0ced74c2f73c --- /dev/null +++ b/Documentation/filesystems/exofs.txt | |||
@@ -0,0 +1,176 @@ | |||
1 | =============================================================================== | ||
2 | WHAT IS EXOFS? | ||
3 | =============================================================================== | ||
4 | |||
5 | exofs is a file system that uses an OSD and exports the API of a normal Linux | ||
6 | file system. Users access exofs like any other local file system, and exofs | ||
7 | will in turn issue commands to the local OSD initiator. | ||
8 | |||
9 | OSD is a new T10 command set that views storage devices not as a large/flat | ||
10 | array of sectors but as a container of objects, each having a length, quota, | ||
11 | time attributes and more. Each object is addressed by a 64bit ID, and is | ||
12 | contained in a 64bit ID partition. Each object has associated attributes | ||
13 | attached to it, which are an integral part of the object and provide metadata about | ||
14 | the object. The standard defines some common obligatory attributes, but user | ||
15 | attributes can be added as needed. | ||
16 | |||
17 | =============================================================================== | ||
18 | ENVIRONMENT | ||
19 | =============================================================================== | ||
20 | |||
21 | To use this file system, you need to have an object store to run it on. You | ||
22 | may download a target from: | ||
23 | http://open-osd.org | ||
24 | |||
25 | See Documentation/scsi/osd.txt for how to setup a working osd environment. | ||
26 | |||
27 | =============================================================================== | ||
28 | USAGE | ||
29 | =============================================================================== | ||
30 | |||
31 | 1. Download and compile exofs and open-osd initiator: | ||
32 | You need an external Kernel source tree or kernel headers from your | ||
33 | distribution (anything based on 2.6.26 or later). | ||
34 | |||
35 | a. download open-osd including exofs source using: | ||
36 | [parent-directory]$ git clone git://git.open-osd.org/open-osd.git | ||
37 | |||
38 | b. Build the library module like this: | ||
39 | [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd | ||
40 | |||
41 | This will build both the open-osd initiator as well as the exofs kernel | ||
42 | module. Use whatever parameters you compiled your Kernel with and | ||
43 | $(KER_DIR) above pointing to the Kernel you compile against. See the file | ||
44 | open-osd/top-level-Makefile for an example. | ||
45 | |||
46 | 2. Get the OSD initiator and target set up properly, and login to the target. | ||
47 | See Documentation/scsi/osd.txt for further instructions. Also see ./do-osd | ||
48 | for an example script that does all these steps. | ||
49 | |||
50 | 3. Insmod the exofs.ko module: | ||
51 | [exofs]$ insmod exofs.ko | ||
52 | |||
53 | 4. Make sure the directory where you want to mount exists. If not, create it. | ||
54 | (For example, mkdir /mnt/exofs) | ||
55 | |||
56 | 5. At first run you will need to invoke the mkfs.exofs application | ||
57 | |||
58 | As an example, this will create the file system on: | ||
59 | /dev/osd0 partition ID 65536 | ||
60 | |||
61 | mkfs.exofs --pid=65536 --format /dev/osd0 | ||
62 | |||
63 | The --format option is optional. If it is not specified, no OSD_FORMAT will | ||
64 | be performed and a clean file system will be created in the specified pid, | ||
65 | in the available space of the target. (Use --format=size_in_meg to limit | ||
66 | the total LUN space available.) | ||
67 | |||
68 | If the pid already exists it will be deleted and a new one will be created in | ||
69 | its place. Be careful. | ||
70 | |||
71 | An exofs lives inside a single OSD partition. You can create multiple exofs | ||
72 | filesystems on the same device using multiple pids. | ||
73 | |||
74 | (run mkfs.exofs without any parameters for usage help message) | ||
75 | |||
76 | 6. Mount the file system. | ||
77 | |||
78 | For example, to mount /dev/osd0, partition ID 0x10000 on /mnt/exofs: | ||
79 | |||
80 | mount -t exofs -o pid=65536 /dev/osd0 /mnt/exofs/ | ||
81 | |||
82 | 7. For reference (See do-exofs example script): | ||
83 | do-exofs start - an example of how to perform the above steps. | ||
84 | do-exofs stop - an example of how to unmount the file system. | ||
85 | do-exofs format - an example of how to format and mkfs a new exofs. | ||
86 | |||
87 | 8. Extra compilation flags (uncomment in fs/exofs/Kbuild): | ||
88 | CONFIG_EXOFS_DEBUG - for debug messages and extra checks. | ||
89 | |||
90 | =============================================================================== | ||
91 | exofs mount options | ||
92 | =============================================================================== | ||
93 | Similar to any mount command: | ||
94 | mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory | ||
95 | |||
96 | Where: | ||
97 | -t exofs: specifies the exofs file system | ||
98 | |||
99 | /dev/osdX: X is a decimal number. /dev/osdX was created after a successful | ||
100 | login into an OSD target. | ||
101 | |||
102 | mount_exofs_directory: The directory to mount the file system on | ||
103 | |||
104 | exofs specific options: Options are separated by commas (,) | ||
105 | pid=<integer> - The partition number to mount/create as | ||
106 | container of the filesystem. | ||
107 | This option is mandatory | ||
108 | to=<integer> - Timeout in ticks for a single command | ||
109 | default is (60 * HZ) [for debugging only] | ||
110 | |||
111 | =============================================================================== | ||
112 | DESIGN | ||
113 | =============================================================================== | ||
114 | |||
115 | * The file system control block (AKA on-disk superblock) resides in an object | ||
116 | with a special ID (defined in common.h). | ||
117 | Information included in the file system control block is used to fill the | ||
118 | in-memory superblock structure at mount time. This object is created before | ||
119 | the file system is used by mkexofs.c It contains information such as: | ||
120 | - The file system's magic number | ||
121 | - The next inode number to be allocated | ||
122 | |||
123 | * Each file resides in its own object and contains the data (and it will be | ||
124 | possible to extend the file over multiple objects, though this has not been | ||
125 | implemented yet). | ||
126 | |||
127 | * A directory is treated as a file, and essentially contains a list of <file | ||
128 | name, inode #> pairs for files that are found in that directory. The object | ||
129 | IDs correspond to the files' inode numbers and will be allocated according to | ||
130 | a bitmap (stored in a separate object). Now they are allocated using a | ||
131 | counter. | ||
132 | |||
133 | * Each file's control block (AKA on-disk inode) is stored in its object's | ||
134 | attributes. This applies to both regular files and other types (directories, | ||
135 | device files, symlinks, etc.). | ||
136 | |||
137 | * Credentials are generated per object (inode and superblock) when the object is | ||
138 | created in memory (read off disk or created). The credential works for all | ||
139 | operations and is used as long as the object remains in memory. | ||
140 | |||
141 | * Async OSD operations are used whenever possible, but the target may execute | ||
142 | them out of order. The operations that concern us are create, delete, | ||
143 | readpage, writepage, update_inode, and truncate. The following pairs of | ||
144 | operations should execute in the order written, and we need to prevent them | ||
145 | from executing in reverse order (see the sketch after this list): | ||
146 | - The following are handled with the OBJ_CREATED and OBJ_2BCREATED | ||
147 | flags. OBJ_CREATED is set when we know the object exists on the OSD - | ||
148 | in create's callback function, and when we successfully do a read_inode. | ||
149 | OBJ_2BCREATED is set in the beginning of the create function, so we | ||
150 | know that we should wait. | ||
151 | - create/delete: delete should wait until the object is created | ||
152 | on the OSD. | ||
153 | - create/readpage: readpage should be able to return a page | ||
154 | full of zeroes in this case. If there was a write already | ||
155 | en-route (i.e. create, writepage, readpage) then the page | ||
156 | would be locked, and so it would really be the same as | ||
157 | create/writepage. | ||
158 | - create/writepage: if writepage is called for a sync write, it | ||
159 | should wait until the object is created on the OSD. | ||
160 | Otherwise, it should just return. | ||
161 | - create/truncate: truncate should wait until the object is | ||
162 | created on the OSD. | ||
163 | - create/update_inode: update_inode should wait until the | ||
164 | object is created on the OSD. | ||
165 | - Handled by VFS locks: | ||
166 | - readpage/delete: shouldn't happen because of page lock. | ||
167 | - writepage/delete: shouldn't happen because of page lock. | ||
168 | - readpage/writepage: shouldn't happen because of page lock. | ||
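The sketch referred to above is not exofs code; it just shows the generic
flag-plus-wait-queue idiom that such "later operation waits for create" rules
can be built on.  All names in it are made up:

	static DECLARE_WAIT_QUEUE_HEAD(example_create_wq);
	#define EXAMPLE_OBJ_CREATED 0	/* bit number in obj_flags */

	/* called from create's completion callback */
	static void example_create_done(unsigned long *obj_flags)
	{
		set_bit(EXAMPLE_OBJ_CREATED, obj_flags);
		wake_up_all(&example_create_wq);
	}

	/* called by delete, truncate, sync writepage, update_inode, ... */
	static void example_wait_for_create(unsigned long *obj_flags)
	{
		wait_event(example_create_wq,
			   test_bit(EXAMPLE_OBJ_CREATED, obj_flags));
	}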
169 | |||
170 | =============================================================================== | ||
171 | LICENSE/COPYRIGHT | ||
172 | =============================================================================== | ||
173 | The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel | ||
174 | version 2.6.10). All files include the original copyrights, and the license | ||
175 | is GPL version 2 (only version 2, as is true for the Linux kernel). The | ||
176 | Linux kernel can be downloaded from www.kernel.org. | ||
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt index 4333e836c495..e055acb6b2d4 100644 --- a/Documentation/filesystems/ext2.txt +++ b/Documentation/filesystems/ext2.txt | |||
@@ -373,10 +373,11 @@ Filesystem Resizing http://ext2resize.sourceforge.net/ | |||
373 | Compression (*) http://e2compr.sourceforge.net/ | 373 | Compression (*) http://e2compr.sourceforge.net/ |
374 | 374 | ||
375 | Implementations for: | 375 | Implementations for: |
376 | Windows 95/98/NT/2000 http://uranus.it.swin.edu.au/~jn/linux/Explore2fs.htm | 376 | Windows 95/98/NT/2000 http://www.chrysocome.net/explore2fs |
377 | Windows 95 (*) http://www.yipton.demon.co.uk/content.html#FSDEXT2 | 377 | Windows 95 (*) http://www.yipton.net/content.html#FSDEXT2 |
378 | DOS client (*) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/ | 378 | DOS client (*) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/ |
379 | OS/2 http://perso.wanadoo.fr/matthieu.willm/ext2-os2/ | 379 | OS/2 (+) ftp://metalab.unc.edu/pub/Linux/system/filesystems/ext2/ |
380 | RISC OS client ftp://ftp.barnet.ac.uk/pub/acorn/armlinux/iscafs/ | 380 | RISC OS client http://www.esw-heim.tu-clausthal.de/~marco/smorbrod/IscaFS/ |
381 | 381 | ||
382 | (*) no longer actively developed/supported (as of Apr 2001) | 382 | (*) no longer actively developed/supported (as of Apr 2001) |
383 | (+) no longer actively developed/supported (as of Mar 2009) | ||
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 9dd2a3bb2acc..570f9bd9be2b 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt | |||
@@ -14,6 +14,11 @@ Options | |||
14 | When mounting an ext3 filesystem, the following option are accepted: | 14 | When mounting an ext3 filesystem, the following option are accepted: |
15 | (*) == default | 15 | (*) == default |
16 | 16 | ||
17 | ro Mount filesystem read only. Note that ext3 will replay | ||
18 | the journal (and thus write to the partition) even when | ||
19 | mounted "read only". Mount options "ro,noload" can be | ||
20 | used to prevent writes to the filesystem. | ||
21 | |||
17 | journal=update Update the ext3 file system's journal to the current | 22 | journal=update Update the ext3 file system's journal to the current |
18 | format. | 23 | format. |
19 | 24 | ||
@@ -27,7 +32,9 @@ journal_dev=devnum When the external journal device's major/minor numbers | |||
27 | identified through its new major/minor numbers encoded | 32 | identified through its new major/minor numbers encoded |
28 | in devnum. | 33 | in devnum. |
29 | 34 | ||
30 | noload Don't load the journal on mounting. | 35 | noload Don't load the journal on mounting. Note that this forces |
36 | mount of an inconsistent filesystem, which can lead to | ||
37 | various problems. | ||
31 | 38 | ||
32 | data=journal All data are committed into the journal prior to being | 39 | data=journal All data are committed into the journal prior to being |
33 | written into the main file system. | 40 | written into the main file system. |
@@ -92,9 +99,12 @@ nocheck | |||
92 | 99 | ||
93 | debug Extra debugging information is sent to syslog. | 100 | debug Extra debugging information is sent to syslog. |
94 | 101 | ||
95 | errors=remount-ro(*) Remount the filesystem read-only on an error. | 102 | errors=remount-ro Remount the filesystem read-only on an error. |
96 | errors=continue Keep going on a filesystem error. | 103 | errors=continue Keep going on a filesystem error. |
97 | errors=panic Panic and halt the machine if an error occurs. | 104 | errors=panic Panic and halt the machine if an error occurs. |
105 | (These mount options override the errors behavior | ||
106 | specified in the superblock, which can be | ||
107 | configured using tune2fs.) | ||
98 | 108 | ||
99 | data_err=ignore(*) Just print an error message if an error occurs | 109 | data_err=ignore(*) Just print an error message if an error occurs |
100 | in a file data buffer in ordered mode. | 110 | in a file data buffer in ordered mode. |
@@ -198,5 +208,5 @@ kernel source: <file:fs/ext3/> | |||
198 | programs: http://e2fsprogs.sourceforge.net/ | 208 | programs: http://e2fsprogs.sourceforge.net/ |
199 | http://ext2resize.sourceforge.net | 209 | http://ext2resize.sourceforge.net |
200 | 210 | ||
201 | useful links: http://www-106.ibm.com/developerworks/linux/library/l-fs7/ | 211 | useful links: http://www.ibm.com/developerworks/library/l-fs7.html |
202 | http://www-106.ibm.com/developerworks/linux/library/l-fs8/ | 212 | http://www.ibm.com/developerworks/library/l-fs8.html |
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index cec829bc7291..97882df04865 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt | |||
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be | |||
85 | * extent format more robust in face of on-disk corruption due to magics, | 85 | * extent format more robust in face of on-disk corruption due to magics, |
86 | * internal redundancy in tree | 86 | * internal redundancy in tree |
87 | * improved file allocation (multi-block alloc) | 87 | * improved file allocation (multi-block alloc) |
88 | * fix 32000 subdirectory limit | 88 | * lift 32000 subdirectory limit imposed by i_links_count[1] |
89 | * nsec timestamps for mtime, atime, ctime, create time | 89 | * nsec timestamps for mtime, atime, ctime, create time |
90 | * inode version field on disk (NFSv4, Lustre) | 90 | * inode version field on disk (NFSv4, Lustre) |
91 | * reduced e2fsck time via uninit_bg feature | 91 | * reduced e2fsck time via uninit_bg feature |
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be | |||
100 | * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force | 100 | * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force |
101 | the ordering) | 101 | the ordering) |
102 | 102 | ||
103 | [1] Filesystems with a block size of 1k may see a limit imposed by the | ||
104 | directory hash tree having a maximum depth of two. | ||
105 | |||
103 | 2.2 Candidate features for future inclusion | 106 | 2.2 Candidate features for future inclusion |
104 | 107 | ||
105 | * Online defrag (patches available but not well tested) | 108 | * Online defrag (patches available but not well tested) |
@@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata | |||
180 | performance. | 183 | performance. |
181 | 184 | ||
182 | barrier=<0|1(*)> This enables/disables the use of write barriers in | 185 | barrier=<0|1(*)> This enables/disables the use of write barriers in |
183 | the jbd code. barrier=0 disables, barrier=1 enables. | 186 | barrier(*) the jbd code. barrier=0 disables, barrier=1 enables. |
184 | This also requires an IO stack which can support | 187 | nobarrier This also requires an IO stack which can support |
185 | barriers, and if jbd gets an error on a barrier | 188 | barriers, and if jbd gets an error on a barrier |
186 | write, it will disable again with a warning. | 189 | write, it will disable again with a warning. |
187 | Write barriers enforce proper on-disk ordering | 190 | Write barriers enforce proper on-disk ordering |
@@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in | |||
189 | safe to use, at some performance penalty. If | 192 | safe to use, at some performance penalty. If |
190 | your disks are battery-backed in one way or another, | 193 | your disks are battery-backed in one way or another, |
191 | disabling barriers may safely improve performance. | 194 | disabling barriers may safely improve performance. |
195 | The mount options "barrier" and "nobarrier" can | ||
196 | also be used to enable or disable barriers, for | ||
197 | consistency with other ext4 mount options. | ||
192 | 198 | ||
193 | inode_readahead=n This tuning parameter controls the maximum | 199 | inode_readahead=n This tuning parameter controls the maximum |
194 | number of inode table blocks that ext4's inode | 200 | number of inode table blocks that ext4's inode |
@@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the | |||
310 | a slightly higher priority than the default I/O | 316 | a slightly higher priority than the default I/O |
311 | priority. | 317 | priority. |
312 | 318 | ||
319 | auto_da_alloc(*) Many broken applications don't use fsync() when | ||
320 | noauto_da_alloc replacing existing files via patterns such as | ||
321 | fd = open("foo.new")/write(fd,..)/close(fd)/ | ||
322 | rename("foo.new", "foo"), or worse yet, | ||
323 | fd = open("foo", O_TRUNC)/write(fd,..)/close(fd). | ||
324 | If auto_da_alloc is enabled, ext4 will detect | ||
325 | the replace-via-rename and replace-via-truncate | ||
326 | patterns and force any delayed allocation | ||
327 | blocks to be allocated such that at the next | ||
328 | journal commit, in the default data=ordered | ||
329 | mode, the data blocks of the new file are forced | ||
330 | to disk before the rename() operation is | ||
331 | committed. This provides roughly the same level | ||
332 | of guarantees as ext3, and avoids the | ||
333 | "zero-length" problem that can happen when a | ||
334 | system crashes before the delayed allocation | ||
335 | blocks are forced to disk. | ||
336 | |||
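For reference, the application-side sequence that does not rely on
auto_da_alloc is the classic write-to-temporary, fsync, rename pattern.  This
is a minimal userspace sketch, not part of the original text:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static int replace_file(const char *path, const char *buf, size_t len)
	{
		char tmp[4096];
		int fd;

		snprintf(tmp, sizeof(tmp), "%s.new", path);
		fd = open(tmp, O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (fd < 0)
			return -1;
		if (write(fd, buf, len) != (ssize_t)len || fsync(fd) != 0) {
			close(fd);
			unlink(tmp);
			return -1;
		}
		if (close(fd) != 0) {
			unlink(tmp);
			return -1;
		}
		return rename(tmp, path); /* data is on disk before the rename */
	}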
313 | Data Mode | 337 | Data Mode |
314 | ========= | 338 | ========= |
315 | There are 3 different data modes: | 339 | There are 3 different data modes: |
diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/knfsd-stats.txt new file mode 100644 index 000000000000..64ced5149d37 --- /dev/null +++ b/Documentation/filesystems/knfsd-stats.txt | |||
@@ -0,0 +1,159 @@ | |||
1 | |||
2 | Kernel NFS Server Statistics | ||
3 | ============================ | ||
4 | |||
5 | This document describes the format and semantics of the statistics | ||
6 | which the kernel NFS server makes available to userspace. These | ||
7 | statistics are available in several text form pseudo files, each of | ||
8 | which is described separately below. | ||
9 | |||
10 | In most cases you don't need to know these formats, as the nfsstat(8) | ||
11 | program from the nfs-utils distribution provides a helpful command-line | ||
12 | interface for extracting and printing them. | ||
13 | |||
14 | All the files described here are formatted as a sequence of text lines, | ||
15 | separated by newline '\n' characters. Lines beginning with a hash | ||
16 | '#' character are comments intended for humans and should be ignored | ||
17 | by parsing routines. All other lines contain a sequence of fields | ||
18 | separated by whitespace. | ||
19 | |||
20 | /proc/fs/nfsd/pool_stats | ||
21 | ------------------------ | ||
22 | |||
23 | This file is available in kernels from 2.6.30 onwards, if the | ||
24 | /proc/fs/nfsd filesystem is mounted (it almost always should be). | ||
25 | |||
26 | The first line is a comment which describes the fields present in | ||
27 | all the other lines. The other lines present the following data as | ||
28 | a sequence of unsigned decimal numeric fields. One line is shown | ||
29 | for each NFS thread pool. | ||
30 | |||
31 | All counters are 64 bits wide and wrap naturally. There is no way | ||
32 | to zero these counters, instead applications should do their own | ||
33 | rate conversion. | ||
34 | |||
35 | pool | ||
36 | The id number of the NFS thread pool to which this line applies. | ||
37 | This number does not change. | ||
38 | |||
39 | Thread pool ids are a contiguous set of small integers starting | ||
40 | at zero. The maximum value depends on the thread pool mode, but | ||
41 | currently cannot be larger than the number of CPUs in the system. | ||
42 | Note that in the default case there will be a single thread pool | ||
43 | which contains all the nfsd threads and all the CPUs in the system, | ||
44 | and thus this file will have a single line with a pool id of "0". | ||
45 | |||
46 | packets-arrived | ||
47 | Counts how many NFS packets have arrived. More precisely, this | ||
48 | is the number of times that the network stack has notified the | ||
49 | sunrpc server layer that new data may be available on a transport | ||
50 | (e.g. a TCP or UDP socket or an NFS/RDMA endpoint). | ||
51 | |||
52 | Depending on the NFS workload patterns and various network stack | ||
53 | effects (such as Large Receive Offload) which can combine packets | ||
54 | on the wire, this may be either more or less than the number | ||
55 | of NFS calls received (which statistic is available elsewhere). | ||
56 | However this is a more accurate and less workload-dependent measure | ||
57 | of how much CPU load is being placed on the sunrpc server layer | ||
58 | due to NFS network traffic. | ||
59 | |||
60 | sockets-enqueued | ||
61 | Counts how many times an NFS transport is enqueued to wait for | ||
62 | an nfsd thread to service it, i.e. no nfsd thread was considered | ||
63 | available. | ||
64 | |||
65 | The circumstance this statistic tracks indicates that there was NFS | ||
66 | network-facing work to be done but it couldn't be done immediately, | ||
67 | thus introducing a small delay in servicing NFS calls. The ideal | ||
68 | rate of change for this counter is zero; significantly non-zero | ||
69 | values may indicate a performance limitation. | ||
70 | |||
71 | This can happen either because there are too few nfsd threads in the | ||
72 | thread pool for the NFS workload (the workload is thread-limited), | ||
73 | or because the NFS workload needs more CPU time than is available in | ||
74 | the thread pool (the workload is CPU-limited). In the former case, | ||
75 | configuring more nfsd threads will probably improve the performance | ||
76 | of the NFS workload. In the latter case, the sunrpc server layer is | ||
77 | already choosing not to wake idle nfsd threads because there are too | ||
78 | many nfsd threads which want to run but cannot, so configuring more | ||
79 | nfsd threads will make no difference whatsoever. The overloads-avoided | ||
80 | statistic (see below) can be used to distinguish these cases. | ||
81 | |||
82 | threads-woken | ||
83 | Counts how many times an idle nfsd thread is woken to try to | ||
84 | receive some data from an NFS transport. | ||
85 | |||
86 | This statistic tracks the circumstance where incoming | ||
87 | network-facing NFS work is being handled quickly, which is a good | ||
88 | thing. The ideal rate of change for this counter will be close | ||
89 | to but less than the rate of change of the packets-arrived counter. | ||
90 | |||
91 | overloads-avoided | ||
92 | Counts how many times the sunrpc server layer chose not to wake an | ||
93 | nfsd thread, despite the presence of idle nfsd threads, because | ||
94 | too many nfsd threads had been recently woken but could not get | ||
95 | enough CPU time to actually run. | ||
96 | |||
97 | This statistic counts a circumstance where the sunrpc layer | ||
98 | heuristically avoids overloading the CPU scheduler with too many | ||
99 | runnable nfsd threads. The ideal rate of change for this counter | ||
100 | is zero. Significant non-zero values indicate that the workload | ||
101 | is CPU limited. Usually this is associated with heavy CPU usage | ||
102 | on all the CPUs in the nfsd thread pool. | ||
103 | |||
104 | If a sustained large overloads-avoided rate is detected on a pool, | ||
105 | the top(1) utility should be used to check for the following | ||
106 | pattern of CPU usage on all the CPUs associated with the given | ||
107 | nfsd thread pool. | ||
108 | |||
109 | - %us ~= 0 (as you're *NOT* running applications on your NFS server) | ||
110 | |||
111 | - %wa ~= 0 | ||
112 | |||
113 | - %id ~= 0 | ||
114 | |||
115 | - %sy + %hi + %si ~= 100 | ||
116 | |||
117 | If this pattern is seen, configuring more nfsd threads will *not* | ||
118 | improve the performance of the workload. If this pattern is not | ||
119 | seen, then something more subtle is wrong. | ||
120 | |||
121 | threads-timedout | ||
122 | Counts how many times an nfsd thread triggered an idle timeout, | ||
123 | i.e. was not woken to handle any incoming network packets for | ||
124 | some time. | ||
125 | |||
126 | This statistic counts a circumstance where there are more nfsd | ||
127 | threads configured than can be used by the NFS workload. This is | ||
128 | a clue that the number of nfsd threads can be reduced without | ||
129 | affecting performance. Unfortunately, it's only a clue and not | ||
130 | a strong indication, for a couple of reasons: | ||
131 | |||
132 | - Currently the rate at which the counter is incremented is quite | ||
133 | slow; the idle timeout is 60 minutes. Unless the NFS workload | ||
134 | remains constant for hours at a time, this counter is unlikely | ||
135 | to be providing information that is still useful. | ||
136 | |||
137 | - It is usually a wise policy to provide some slack, | ||
138 | i.e. configure a few more nfsds than are currently needed, | ||
139 | to allow for future spikes in load. | ||
140 | |||
141 | |||
142 | Note that incoming packets on NFS transports will be dealt with in | ||
143 | one of three ways. An nfsd thread can be woken (threads-woken counts | ||
144 | this case), or the transport can be enqueued for later attention | ||
145 | (sockets-enqueued counts this case), or the packet can be temporarily | ||
146 | deferred because the transport is currently being used by an nfsd | ||
147 | thread. This last case is not very interesting and is not explicitly | ||
148 | counted, but can be inferred from the other counters thus: | ||
149 | |||
150 | packets-deferred = packets-arrived - ( sockets-enqueued + threads-woken ) | ||
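As a worked example of this formula, the following small userspace program
derives packets-deferred for each pool.  It assumes the numeric columns appear
in the order the counters are described above; check the '#' header line
emitted by your kernel, since the exact layout is not specified here:

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/fs/nfsd/pool_stats", "r");

		if (!f) {
			perror("pool_stats");
			return 1;
		}
		while (fgets(line, sizeof(line), f)) {
			unsigned long long pool, arrived, enqueued, woken,
					   avoided, timedout;

			if (line[0] == '#')	/* header/comment line */
				continue;
			if (sscanf(line, "%llu %llu %llu %llu %llu %llu",
				   &pool, &arrived, &enqueued, &woken,
				   &avoided, &timedout) == 6)
				printf("pool %llu: packets-deferred = %llu\n",
				       pool, arrived - (enqueued + woken));
		}
		fclose(f);
		return 0;
	}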
151 | |||
152 | |||
153 | More | ||
154 | ---- | ||
155 | Descriptions of the other statistics file should go here. | ||
156 | |||
157 | |||
158 | Greg Banks <gnb@sgi.com> | ||
159 | 26 Mar 2009 | ||
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs41-server.txt new file mode 100644 index 000000000000..05d81cbcb2e1 --- /dev/null +++ b/Documentation/filesystems/nfs41-server.txt | |||
@@ -0,0 +1,161 @@ | |||
1 | NFSv4.1 Server Implementation | ||
2 | |||
3 | Server support for minorversion 1 can be controlled using the | ||
4 | /proc/fs/nfsd/versions control file. The string output returned | ||
5 | by reading this file will contain either "+4.1" or "-4.1" | ||
6 | correspondingly. | ||
7 | |||
8 | Currently, server support for minorversion 1 is disabled by default. | ||
9 | It can be enabled at run time by writing the string "+4.1" to | ||
10 | the /proc/fs/nfsd/versions control file. Note that to write this | ||
11 | control file, the nfsd service must be taken down. Use your user-mode | ||
12 | nfs-utils to set this up; see rpc.nfsd(8) | ||
13 | |||
14 | The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based | ||
15 | on the latest NFSv4.1 Internet Draft: | ||
16 | http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 | ||
17 | |||
18 | From the many new features in NFSv4.1 the current implementation | ||
19 | focuses on the mandatory-to-implement NFSv4.1 Sessions, providing | ||
20 | "exactly once" semantics and better control and throttling of the | ||
21 | resources allocated for each client. | ||
22 | |||
23 | Other NFSv4.1 features, Parallel NFS operations in particular, | ||
24 | are still under development out of tree. | ||
25 | See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design | ||
26 | for more information. | ||
27 | |||
28 | The table below, taken from the NFSv4.1 document, lists | ||
29 | the operations that are mandatory to implement (REQ), optional | ||
30 | (OPT), and NFSv4.0 operations that are required not to implement (MNI) | ||
31 | in minor version 1. The first column indicates the operations that | ||
32 | are not supported yet by the linux server implementation. | ||
33 | |||
34 | The OPTIONAL features identified and their abbreviations are as follows: | ||
35 | pNFS Parallel NFS | ||
36 | FDELG File Delegations | ||
37 | DDELG Directory Delegations | ||
38 | |||
39 | The following abbreviations indicate the linux server implementation status. | ||
40 | I Implemented NFSv4.1 operations. | ||
41 | NS Not Supported. | ||
42 | NS* unimplemented optional feature. | ||
43 | P pNFS features implemented out of tree. | ||
44 | PNS pNFS features that are not supported yet (out of tree). | ||
45 | |||
46 | Operations | ||
47 | |||
48 | +----------------------+------------+--------------+----------------+ | ||
49 | | Operation | REQ, REC, | Feature | Definition | | ||
50 | | | OPT, or | (REQ, REC, | | | ||
51 | | | MNI | or OPT) | | | ||
52 | +----------------------+------------+--------------+----------------+ | ||
53 | | ACCESS | REQ | | Section 18.1 | | ||
54 | NS | BACKCHANNEL_CTL | REQ | | Section 18.33 | | ||
55 | NS | BIND_CONN_TO_SESSION | REQ | | Section 18.34 | | ||
56 | | CLOSE | REQ | | Section 18.2 | | ||
57 | | COMMIT | REQ | | Section 18.3 | | ||
58 | | CREATE | REQ | | Section 18.4 | | ||
59 | I | CREATE_SESSION | REQ | | Section 18.36 | | ||
60 | NS*| DELEGPURGE | OPT | FDELG (REQ) | Section 18.5 | | ||
61 | | DELEGRETURN | OPT | FDELG, | Section 18.6 | | ||
62 | | | | DDELG, pNFS | | | ||
63 | | | | (REQ) | | | ||
64 | NS | DESTROY_CLIENTID | REQ | | Section 18.50 | | ||
65 | I | DESTROY_SESSION | REQ | | Section 18.37 | | ||
66 | I | EXCHANGE_ID | REQ | | Section 18.35 | | ||
67 | NS | FREE_STATEID | REQ | | Section 18.38 | | ||
68 | | GETATTR | REQ | | Section 18.7 | | ||
69 | P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | | ||
70 | P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | | ||
71 | | GETFH | REQ | | Section 18.8 | | ||
72 | NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 | | ||
73 | P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | | ||
74 | P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | | ||
75 | P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | | ||
76 | | LINK | OPT | | Section 18.9 | | ||
77 | | LOCK | REQ | | Section 18.10 | | ||
78 | | LOCKT | REQ | | Section 18.11 | | ||
79 | | LOCKU | REQ | | Section 18.12 | | ||
80 | | LOOKUP | REQ | | Section 18.13 | | ||
81 | | LOOKUPP | REQ | | Section 18.14 | | ||
82 | | NVERIFY | REQ | | Section 18.15 | | ||
83 | | OPEN | REQ | | Section 18.16 | | ||
84 | NS*| OPENATTR | OPT | | Section 18.17 | | ||
85 | | OPEN_CONFIRM | MNI | | N/A | | ||
86 | | OPEN_DOWNGRADE | REQ | | Section 18.18 | | ||
87 | | PUTFH | REQ | | Section 18.19 | | ||
88 | | PUTPUBFH | REQ | | Section 18.20 | | ||
89 | | PUTROOTFH | REQ | | Section 18.21 | | ||
90 | | READ | REQ | | Section 18.22 | | ||
91 | | READDIR | REQ | | Section 18.23 | | ||
92 | | READLINK | OPT | | Section 18.24 | | ||
93 | NS | RECLAIM_COMPLETE | REQ | | Section 18.51 | | ||
94 | | RELEASE_LOCKOWNER | MNI | | N/A | | ||
95 | | REMOVE | REQ | | Section 18.25 | | ||
96 | | RENAME | REQ | | Section 18.26 | | ||
97 | | RENEW | MNI | | N/A | | ||
98 | | RESTOREFH | REQ | | Section 18.27 | | ||
99 | | SAVEFH | REQ | | Section 18.28 | | ||
100 | | SECINFO | REQ | | Section 18.29 | | ||
101 | NS | SECINFO_NO_NAME | REC | pNFS files | Section 18.45, | | ||
102 | | | | layout (REQ) | Section 13.12 | | ||
103 | I | SEQUENCE | REQ | | Section 18.46 | | ||
104 | | SETATTR | REQ | | Section 18.30 | | ||
105 | | SETCLIENTID | MNI | | N/A | | ||
106 | | SETCLIENTID_CONFIRM | MNI | | N/A | | ||
107 | NS | SET_SSV | REQ | | Section 18.47 | | ||
108 | NS | TEST_STATEID | REQ | | Section 18.48 | | ||
109 | | VERIFY | REQ | | Section 18.31 | | ||
110 | NS*| WANT_DELEGATION | OPT | FDELG (OPT) | Section 18.49 | | ||
111 | | WRITE | REQ | | Section 18.32 | | ||
112 | |||
113 | Callback Operations | ||
114 | |||
115 | +-------------------------+-----------+-------------+---------------+ | ||
116 | | Operation | REQ, REC, | Feature | Definition | | ||
117 | | | OPT, or | (REQ, REC, | | | ||
118 | | | MNI | or OPT) | | | ||
119 | +-------------------------+-----------+-------------+---------------+ | ||
120 | | CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 | | ||
121 | P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | | ||
122 | NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 | | ||
123 | P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | | ||
124 | NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 | | ||
125 | NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 | | ||
126 | | CB_RECALL | OPT | FDELG, | Section 20.2 | | ||
127 | | | | DDELG, pNFS | | | ||
128 | | | | (REQ) | | | ||
129 | NS*| CB_RECALL_ANY | OPT | FDELG, | Section 20.6 | | ||
130 | | | | DDELG, pNFS | | | ||
131 | | | | (REQ) | | | ||
132 | NS | CB_RECALL_SLOT | REQ | | Section 20.8 | | ||
133 | NS*| CB_RECALLABLE_OBJ_AVAIL | OPT | DDELG, pNFS | Section 20.7 | | ||
134 | | | | (REQ) | | | ||
135 | I | CB_SEQUENCE | OPT | FDELG, | Section 20.9 | | ||
136 | | | | DDELG, pNFS | | | ||
137 | | | | (REQ) | | | ||
138 | NS*| CB_WANTS_CANCELLED | OPT | FDELG, | Section 20.10 | | ||
139 | | | | DDELG, pNFS | | | ||
140 | | | | (REQ) | | | ||
141 | +-------------------------+-----------+-------------+---------------+ | ||
142 | |||
143 | Implementation notes: | ||
144 | |||
145 | EXCHANGE_ID: | ||
146 | * only SP4_NONE state protection supported | ||
147 | * implementation ids are ignored | ||
148 | |||
149 | CREATE_SESSION: | ||
150 | * backchannel attributes are ignored | ||
151 | * backchannel security parameters are ignored | ||
152 | |||
153 | SEQUENCE: | ||
154 | * no support for dynamic slot table renegotiation (optional) | ||
155 | |||
156 | nfsv4.1 COMPOUND rules: | ||
157 | The following cases aren't supported yet: | ||
158 | * Enforcing of NFS4ERR_NOT_ONLY_OP for: BIND_CONN_TO_SESSION, CREATE_SESSION, | ||
159 | DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID. | ||
160 | * DESTROY_SESSION MUST be the final operation in the COMPOUND request. | ||
161 | |||
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt new file mode 100644 index 000000000000..55c4300abfcb --- /dev/null +++ b/Documentation/filesystems/nilfs2.txt | |||
@@ -0,0 +1,200 @@ | |||
1 | NILFS2 | ||
2 | ------ | ||
3 | |||
4 | NILFS2 is a log-structured file system (LFS) supporting continuous | ||
5 | snapshotting. In addition to versioning capability of the entire file | ||
6 | system, users can even restore files mistakenly overwritten or | ||
7 | destroyed just a few seconds ago. Since NILFS2 can keep consistency | ||
8 | like conventional LFS, it achieves quick recovery after system | ||
9 | crashes. | ||
10 | |||
11 | NILFS2 creates a number of checkpoints every few seconds or on a per- | ||
12 | synchronous-write basis (unless there is no change). Users can select | ||
13 | significant versions among continuously created checkpoints, and can | ||
14 | change them into snapshots which will be preserved until they are | ||
15 | changed back to checkpoints. | ||
16 | |||
17 | There is no limit on the number of snapshots until the volume gets | ||
18 | full. Each snapshot is mountable as a read-only file system | ||
19 | concurrently with its writable mount, and this feature is convenient | ||
20 | for online backup. | ||
21 | |||
22 | The userland tools are included in nilfs-utils package, which is | ||
23 | available from the following download page. At least "mkfs.nilfs2", | ||
24 | "mount.nilfs2", "umount.nilfs2", and "nilfs_cleanerd" (so called | ||
25 | cleaner or garbage collector) are required. Details on the tools are | ||
26 | described in the man pages included in the package. | ||
27 | |||
28 | Project web page: http://www.nilfs.org/en/ | ||
29 | Download page: http://www.nilfs.org/en/download.html | ||
30 | Git tree web page: http://www.nilfs.org/git/ | ||
31 | NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users | ||
32 | |||
33 | Caveats | ||
34 | ======= | ||
35 | |||
36 | Features which NILFS2 does not support yet: | ||
37 | |||
38 | - atime | ||
39 | - extended attributes | ||
40 | - POSIX ACLs | ||
41 | - quotas | ||
42 | - writable snapshots | ||
43 | - remote backup (CDP) | ||
44 | - data integrity | ||
45 | - defragmentation | ||
46 | |||
47 | Mount options | ||
48 | ============= | ||
49 | |||
50 | NILFS2 supports the following mount options: | ||
51 | (*) == default | ||
52 | |||
53 | barrier=on(*) This enables/disables barriers. barrier=off disables | ||
54 | it, barrier=on enables it. | ||
55 | errors=continue(*) Keep going on a filesystem error. | ||
56 | errors=remount-ro Remount the filesystem read-only on an error. | ||
57 | errors=panic Panic and halt the machine if an error occurs. | ||
58 | cp=n Specify the checkpoint-number of the snapshot to be | ||
59 | mounted. Checkpoints and snapshots are listed by lscp | ||
60 | user command. Only the checkpoints marked as snapshot | ||
61 | are mountable with this option. A snapshot is read-only, | ||
62 | so a read-only mount option must also be specified. | ||
63 | order=relaxed(*) Apply relaxed order semantics that allow modified data | ||
64 | blocks to be written to disk without making a | ||
65 | checkpoint if no metadata update is in progress. This | ||
66 | mode is equivalent to the ordered data mode of the ext3 | ||
67 | filesystem, except that updates to data blocks still | ||
68 | preserve atomicity. This will improve synchronous | ||
69 | write performance for overwriting. | ||
70 | order=strict Apply strict in-order semantics that preserves sequence | ||
71 | of all file operations including overwriting of data | ||
72 | blocks. That means, it is guaranteed that no | ||
73 | overtaking of events occurs in the recovered file | ||
74 | system after a crash. | ||
75 | |||
76 | NILFS2 usage | ||
77 | ============ | ||
78 | |||
79 | To use nilfs2 as a local file system, simply: | ||
80 | |||
81 | # mkfs -t nilfs2 /dev/block_device | ||
82 | # mount -t nilfs2 /dev/block_device /dir | ||
83 | |||
84 | This will also invoke the cleaner through the mount helper program | ||
85 | (mount.nilfs2). | ||
86 | |||
87 | Checkpoints and snapshots are managed by the following commands. | ||
88 | Their manpages are included in the nilfs-utils package above. | ||
89 | |||
90 | lscp list checkpoints or snapshots. | ||
91 | mkcp make a checkpoint or a snapshot. | ||
92 | chcp change an existing checkpoint to a snapshot or vice versa. | ||
93 | rmcp invalidate specified checkpoint(s). | ||
94 | |||
95 | To mount a snapshot, | ||
96 | |||
97 | # mount -t nilfs2 -r -o cp=<cno> /dev/block_device /snap_dir | ||
98 | |||
99 | where <cno> is the checkpoint number of the snapshot. | ||
100 | |||
101 | To unmount the NILFS2 mount point or snapshot, simply: | ||
102 | |||
103 | # umount /dir | ||
104 | |||
105 | Then, the cleaner daemon is automatically shut down by the umount | ||
106 | helper program (umount.nilfs2). | ||
107 | |||
108 | Disk format | ||
109 | =========== | ||
110 | |||
111 | A nilfs2 volume is equally divided into a number of segments except | ||
112 | for the super block (SB) and segment #0. A segment is the container | ||
113 | of logs. Each log is composed of summary information blocks, payload | ||
114 | blocks, and an optional super root block (SR): | ||
115 | |||
116 | ______________________________________________________ | ||
117 | | |SB| | Segment | Segment | Segment | ... | Segment | | | ||
118 | |_|__|_|____0____|____1____|____2____|_____|____N____|_| | ||
119 | 0 +1K +4K +8M +16M +24M +(8MB x N) | ||
120 | . . (Typical offsets for 4KB-block) | ||
121 | . . | ||
122 | .______________________. | ||
123 | | log | log |... | log | | ||
124 | |__1__|__2__|____|__m__| | ||
125 | . . | ||
126 | . . | ||
127 | . . | ||
128 | .______________________________. | ||
129 | | Summary | Payload blocks |SR| | ||
130 | |_blocks__|_________________|__| | ||
131 | |||
132 | The payload blocks are organized per file, and each file consists of | ||
133 | data blocks and B-tree node blocks: | ||
134 | |||
135 | |<--- File-A --->|<--- File-B --->| | ||
136 | _______________________________________________________________ | ||
137 | | Data blocks | B-tree blocks | Data blocks | B-tree blocks | ... | ||
138 | _|_____________|_______________|_____________|_______________|_ | ||
139 | |||
140 | |||
141 | Since only the modified blocks are written in the log, it may have | ||
142 | files without data blocks or B-tree node blocks. | ||
143 | |||
144 | The organization of the blocks is recorded in the summary information | ||
145 | blocks, which contain a header structure (nilfs_segment_summary), per- | ||
146 | file structures (nilfs_finfo), and per-block structures (nilfs_binfo): | ||
147 | |||
148 | _________________________________________________________________________ | ||
149 | | Summary | finfo | binfo | ... | binfo | finfo | binfo | ... | binfo |... | ||
150 | |_blocks__|___A___|_(A,1)_|_____|(A,Na)_|___B___|_(B,1)_|_____|(B,Nb)_|___ | ||
151 | |||
152 | |||
153 | The logs include regular files, directory files, symbolic link files | ||
154 | and several meta data files. The meta data files are the files used | ||
155 | to maintain file system meta data. The current version of NILFS2 uses | ||
156 | the following meta data files: | ||
157 | |||
158 | 1) Inode file (ifile) -- Stores on-disk inodes | ||
159 | 2) Checkpoint file (cpfile) -- Stores checkpoints | ||
160 | 3) Segment usage file (sufile) -- Stores allocation state of segments | ||
161 | 4) Data address translation file -- Maps virtual block numbers to usual | ||
162 | (DAT) block numbers. This file serves to | ||
163 | make on-disk blocks relocatable. | ||
164 | |||
165 | The following figure shows a typical organization of the logs: | ||
166 | |||
167 | _________________________________________________________________________ | ||
168 | | Summary | regular file | file | ... | ifile | cpfile | sufile | DAT |SR| | ||
169 | |_blocks__|_or_directory_|_______|_____|_______|________|________|_____|__| | ||
170 | |||
171 | |||
172 | To stride over segment boundaries, this sequence of files may be split | ||
173 | into multiple logs. A sequence of logs that should be treated as | ||
174 | logically one log is delimited with flags marked in the segment | ||
175 | summary. The recovery code of nilfs2 looks at this boundary | ||
176 | information to ensure atomicity of updates. | ||
177 | |||
178 | The super root block is inserted for every checkpoint. It includes | ||
179 | three special inodes: the inodes of the DAT, cpfile, and sufile. Inodes | ||
180 | of regular files, directories, symlinks and other special files, are | ||
181 | included in the ifile. The inode of ifile itself is included in the | ||
182 | corresponding checkpoint entry in the cpfile. Thus, the hierarchy | ||
183 | among NILFS2 files can be depicted as follows: | ||
184 | |||
185 | Super block (SB) | ||
186 | | | ||
187 | v | ||
188 | Super root block (the latest cno=xx) | ||
189 | |-- DAT | ||
190 | |-- sufile | ||
191 | `-- cpfile | ||
192 | |-- ifile (cno=c1) | ||
193 | |-- ifile (cno=c2) ---- file (ino=i1) | ||
194 | : : |-- file (ino=i2) | ||
195 | `-- ifile (cno=xx) |-- file (ino=i3) | ||
196 | : : | ||
197 | `-- file (ino=yy) | ||
198 | ( regular file, directory, or symlink ) | ||
199 | |||
200 | For detail on the format of each file, please see include/linux/nilfs2_fs.h. | ||
diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt new file mode 100644 index 000000000000..dcf833587162 --- /dev/null +++ b/Documentation/filesystems/pohmelfs/design_notes.txt | |||
@@ -0,0 +1,71 @@ | |||
1 | POHMELFS: Parallel Optimized Host Message Exchange Layered File System. | ||
2 | |||
3 | Evgeniy Polyakov <zbr@ioremap.net> | ||
4 | |||
5 | Homepage: http://www.ioremap.net/projects/pohmelfs | ||
6 | |||
7 | POHMELFS first began as a network filesystem with coherent local data and | ||
8 | metadata caches but is now evolving into a parallel distributed filesystem. | ||
9 | |||
10 | Main features of this FS include: | ||
11 | * Locally coherent cache for data and metadata with (potentially) byte-range locks. | ||
12 | Since all Linux filesystems lock the whole inode during writing, the | ||
13 | algorithm is very simple and does not use byte ranges, although they are | ||
14 | sent in locking messages. | ||
15 | * Completely async processing of all events except creation of hard and symbolic | ||
16 | links, and rename events. | ||
17 | Object creation and data reading and writing are processed asynchronously. | ||
18 | * Flexible object architecture optimized for network processing. | ||
19 | Ability to create long paths to objects and remove arbitrarily huge | ||
20 | directories with a single network command. | ||
21 | (like removing the whole kernel tree via a single network command). | ||
22 | * Very high performance. | ||
23 | * Fast and scalable multithreaded userspace server. Being in userspace it works | ||
24 | with any underlying filesystem and is still much faster than the async in-kernel NFS server. | ||
25 | * The client is able to switch between different servers (if one goes down, the client | ||
26 | automatically reconnects to the next one, and so on). | ||
27 | * Transactions support. Full failover for all operations. | ||
28 | Resending transactions to different servers on timeout or error. | ||
29 | * Read request (data read, directory listing, lookup requests) balancing between multiple servers. | ||
30 | * Write requests are replicated to multiple servers and completed only when all of them are acked. | ||
31 | * Ability to add and/or remove servers from the working set at run-time. | ||
32 | * Strong authentication and optional data encryption in the network channel. | ||
33 | * Extended attributes support. | ||
34 | |||
35 | POHMELFS is based on transactions, which are potentially long-standing objects that live | ||
36 | in the client's memory. Each transaction contains all the information needed to process a given | ||
37 | command (or set of commands, which is frequently used during data writing: single transactions | ||
38 | can contain creation and data writing commands). Transactions are committed by all the servers | ||
39 | to which they are sent and, in case of failures, are eventually resent or dropped with an error. | ||
40 | For example, reading will return an error if no servers are available. | ||
41 | |||
42 | POHMELFS uses an asynchronous approach to data processing. Courtesy of transactions, it is | ||
43 | possible to detach replies from requests and, if the command requires data to be received, the | ||
44 | caller sleeps waiting for it. Thus, it is possible to issue multiple read commands to different | ||
45 | servers and async threads will pick up replies in parallel, find appropriate transactions in the | ||
46 | system and put the data where it belongs (like the page or inode cache). | ||
47 | |||
48 | The main feature of POHMELFS is writeback data and the metadata cache. | ||
49 | Only a few non-performance critical operations use the write-through cache and | ||
50 | are synchronous: hard and symbolic link creation, and object rename. Creation, | ||
51 | removal of objects and data writing are asynchronous and are sent to | ||
52 | the server during system writeback. Only one writer at a time is allowed for any | ||
53 | given inode, which is guarded by an appropriate locking protocol. | ||
54 | Because of this feature, POHMELFS is extremely fast at metadata intensive | ||
55 | workloads and can fully utilize the bandwidth to the servers when doing bulk | ||
56 | data transfers. | ||
57 | |||
58 | POHMELFS clients operate with a working set of servers and are capable of balancing read-only | ||
59 | operations (like lookups or directory listings) between them according to IO priorities. | ||
60 | Administrators can add or remove servers from the set at run-time via special commands (described | ||
61 | in the Documentation/pohmelfs/info.txt file). Writes are replicated to all servers that are connected | ||
62 | with write permission turned on. IO priority and permissions can be changed at run-time. | ||
63 | |||
64 | POHMELFS is capable of full data channel encryption and/or strong crypto hashing. | ||
65 | One can select any kernel supported cipher, encryption mode, hash type and operation mode | ||
66 | (hmac or digest). It is also possible to use both or neither (default). Crypto configuration | ||
67 | is checked during mount time and, if the server does not support it, appropriate capabilities | ||
68 | will be disabled or mount will fail (if 'crypto_fail_unsupported' mount option is specified). | ||
69 | Crypto performance heavily depends on the number of crypto threads, which asynchronously perform | ||
70 | crypto operations and send the resulting data to server or submit it up the stack. This number | ||
71 | can be controlled via a mount option. | ||
diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt new file mode 100644 index 000000000000..db2e41393626 --- /dev/null +++ b/Documentation/filesystems/pohmelfs/info.txt | |||
@@ -0,0 +1,99 @@ | |||
1 | POHMELFS usage information. | ||
2 | |||
3 | Mount options. | ||
4 | All options except the index, the number of crypto threads and the maximum IO size can be changed via remount. | ||
5 | |||
6 | idx=%u | ||
7 | Each mountpoint is associated with a special index via this option. | ||
8 | The administrator can add or remove servers from the given index, so all mounts | ||
9 | attached to it are updated. | ||
10 | The default is 0. | ||
11 | |||
12 | trans_scan_timeout=%u | ||
13 | This timeout, expressed in milliseconds, specifies how often transaction | ||
14 | trees are scanned for stale requests, which are then resent or, if the number | ||
15 | of retries exceeds the specified limit, dropped with an error. | ||
16 | Default is 5 seconds. | ||
17 | |||
18 | drop_scan_timeout=%u | ||
19 | Internal timeout, expressed in milliseconds, which specifies how frequently | ||
20 | inodes marked to be dropped are freed. It also specifies how frequently | ||
21 | the system checks whether servers have to be added to or removed from the current working set. | ||
22 | Default is 1 second. | ||
23 | |||
24 | wait_on_page_timeout=%u | ||
25 | Number of milliseconds to wait for reply from remote server for data reading command. | ||
26 | If this timeout is exceeded, reading returns an error. | ||
27 | Default is 5 seconds. | ||
28 | |||
29 | trans_retries=%u | ||
30 | This is the number of times that a transaction will be resent to a server that did | ||
31 | not answer for the last @trans_scan_timeout milliseconds. | ||
32 | When the number of resends exceeds this limit, the transaction is completed with an error. | ||
33 | Default is 5 resends. | ||
34 | |||
35 | crypto_thread_num=%u | ||
36 | Number of crypto processing threads. Threads are used both for RX and TX traffic. | ||
37 | Default is 2, or no threads if crypto operations are not supported. | ||
38 | |||
39 | trans_max_pages=%u | ||
40 | Maximum number of pages in a single transaction. This parameter also controls | ||
41 | the number of pages allocated for crypto processing (each crypto thread has a | ||
42 | pool of pages whose size is equal to 'trans_max_pages'). | ||
43 | Default is 100 pages. | ||
44 | |||
45 | crypto_fail_unsupported | ||
46 | If specified, mount will fail if the server does not support requested crypto operations. | ||
47 | By default mount will disable non-matching crypto operations. | ||
48 | |||
49 | mcache_timeout=%u | ||
50 | Maximum number of milliseconds to wait for the mcache objects to be processed. | ||
51 | The mcache includes locks (a given lock should be granted by the server) and attributes | ||
52 | (they should be fully received in the given timeframe). | ||
53 | Default is 5 seconds. | ||
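
As an illustration only, several of the options above could be combined
on a single mount command line; the index and timeout values below are
arbitrary:

  mount -t pohmel -o idx=0,trans_scan_timeout=10000,trans_retries=3,crypto_thread_num=4 q /mnt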
54 | |||
55 | Usage examples. | ||
56 | |||
57 | Add server server1.net:1025 into the working set with index $idx | ||
58 | with appropriate hash algorithm and key file and cipher algorithm, mode and key file: | ||
59 | $cfg A add -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key | ||
60 | |||
61 | Mount filesystem with given index $idx to /mnt mountpoint. | ||
62 | Client will connect to all servers specified in the working set via previous command: | ||
63 | mount -t pohmel -o idx=$idx q /mnt | ||
64 | |||
65 | Change permissions to read-only (-I 1 option, '-I 2' - write-only, 3 - rw): | ||
66 | $cfg A modify -a server1.net -p 1025 -i $idx -I 1 | ||
67 | |||
68 | Change IO priority to 123 (the node with the highest priority gets read requests): | ||
69 | $cfg A modify -a server1.net -p 1025 -i $idx -P 123 | ||
70 | |||
71 | One can check the current status of all connections in the mountstats file: | ||
72 | # cat /proc/$PID/mountstats | ||
73 | ... | ||
74 | device none mounted on /mnt with fstype pohmel | ||
75 | idx addr(:port) socket_type protocol active priority permissions | ||
76 | 0 server1.net:1026 1 6 1 250 1 | ||
77 | 0 server2.net:1025 1 6 1 123 3 | ||
78 | |||
79 | Server installation. | ||
80 | |||
81 | Create a server which listens on port 1025 and address 0.0.0.0. | ||
82 | The working root directory (note that the server chroots there, so you must have appropriate permissions) | ||
83 | is set to /mnt; the server will negotiate hash/cipher with the client, in case the client requested it, | ||
84 | using the given key files. | ||
85 | The number of working threads is set to 10. | ||
86 | |||
87 | # ./fserver -a 0.0.0.0 -p 1025 -r /mnt -w 10 -K hash_key -k cipher_key | ||
88 | |||
89 | -A 6 - listen on ipv6 address. Default: Disabled. | ||
90 | -r root - path to root directory. Default: /tmp. | ||
91 | -a addr - listen address. Default: 0.0.0.0. | ||
92 | -p port - listen port. Default: 1025. | ||
93 | -w workers - number of workers per connected client. Default: 1. | ||
94 | -K file - hash key size. Default: none. | ||
95 | -k file - cipher key size. Default: none. | ||
96 | -h - this help. | ||
97 | |||
98 | The number of worker threads specifies how many workers will be created for each client. | ||
99 | Bulk single-client transfers are usually better handled with a smaller number (like 1-3). | ||
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt new file mode 100644 index 000000000000..40ea6c295afb --- /dev/null +++ b/Documentation/filesystems/pohmelfs/network_protocol.txt | |||
@@ -0,0 +1,227 @@ | |||
1 | POHMELFS network protocol. | ||
2 | |||
3 | The basic structure used in network communication is the following command header: | ||
4 | |||
5 | struct netfs_cmd | ||
6 | { | ||
7 | __u16 cmd; /* Command number */ | ||
8 | __u16 csize; /* Attached crypto information size */ | ||
9 | __u16 cpad; /* Attached padding size */ | ||
10 | __u16 ext; /* External flags */ | ||
11 | __u32 size; /* Size of the attached data */ | ||
12 | __u32 trans; /* Transaction id */ | ||
13 | __u64 id; /* Object ID to operate on. Used for feedback.*/ | ||
14 | __u64 start; /* Start of the object. */ | ||
15 | __u64 iv; /* IV sequence */ | ||
16 | __u8 data[0]; | ||
17 | }; | ||
18 | |||
19 | Commands can be embedded into a transaction command (which in turn has its own command number), | ||
20 | so the protocol can be extended as needed without breaking backward compatibility as long | ||
21 | as old commands are supported. All string lengths include the trailing 0 byte. | ||
22 | |||
23 | All commands are transferred over the network in big-endian. Native CPU endianness is used at the end peers. | ||
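
As a rough sketch of what that byte-order conversion involves (this is
an illustration, not the driver's actual helper; the htobe*() calls are
the userspace <endian.h> interface, and kernel code would use the
cpu_to_be*() helpers instead):

	#include <endian.h>
	#include <stdint.h>

	struct netfs_cmd_wire {		/* mirrors struct netfs_cmd above */
		uint16_t cmd, csize, cpad, ext;
		uint32_t size, trans;
		uint64_t id, start, iv;
	};

	/* Convert a host-order command header to big-endian before sending. */
	static void netfs_cmd_to_wire(struct netfs_cmd_wire *c)
	{
		c->cmd   = htobe16(c->cmd);
		c->csize = htobe16(c->csize);
		c->cpad  = htobe16(c->cpad);
		c->ext   = htobe16(c->ext);
		c->size  = htobe32(c->size);
		c->trans = htobe32(c->trans);
		c->id    = htobe64(c->id);
		c->start = htobe64(c->start);
		c->iv    = htobe64(c->iv);
	}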
24 | |||
25 | @cmd - command number, which specifies command to be processed. Following | ||
26 | commands are used currently: | ||
27 | |||
28 | NETFS_READDIR = 1, /* Read directory for given inode number */ | ||
29 | NETFS_READ_PAGE, /* Read data page from the server */ | ||
30 | NETFS_WRITE_PAGE, /* Write data page to the server */ | ||
31 | NETFS_CREATE, /* Create directory entry */ | ||
32 | NETFS_REMOVE, /* Remove directory entry */ | ||
33 | NETFS_LOOKUP, /* Lookup single object */ | ||
34 | NETFS_LINK, /* Create a link */ | ||
35 | NETFS_TRANS, /* Transaction */ | ||
36 | NETFS_OPEN, /* Open intent */ | ||
37 | NETFS_INODE_INFO, /* Metadata cache coherency synchronization message */ | ||
38 | NETFS_PAGE_CACHE, /* Page cache invalidation message */ | ||
39 | NETFS_READ_PAGES, /* Read multiple contiguous pages in one go */ | ||
40 | NETFS_RENAME, /* Rename object */ | ||
41 | NETFS_CAPABILITIES, /* Capabilities of the client, for example supported crypto */ | ||
42 | NETFS_LOCK, /* Distributed lock message */ | ||
43 | NETFS_XATTR_SET, /* Set extended attribute */ | ||
44 | NETFS_XATTR_GET, /* Get extended attribute */ | ||
45 | |||
46 | @ext - external flags. Used by different commands to specify some extra arguments | ||
47 | like partial size of the embedded objects or creation flags. | ||
48 | |||
49 | @size - size of the attached data. For NETFS_READ_PAGE and NETFS_READ_PAGES no data is attached, | ||
50 | but size of the requested data is incorporated here. It does not include size of the command | ||
51 | header (struct netfs_cmd) itself. | ||
52 | |||
53 | @id - id of the object this command operates on. Each command can use it for its own purpose. | ||
54 | |||
55 | @start - start of the object this command operates on. Each command can use it for its own purpose. | ||
56 | |||
57 | @csize, @cpad - size and padding size of the (attached if needed) crypto information. | ||
58 | |||
59 | Command specifications. | ||
60 | |||
61 | @NETFS_READDIR | ||
62 | This command is used to sync content of the remote dir to the client. | ||
63 | |||
64 | @ext - length of the path to object. | ||
65 | @size - the same. | ||
66 | @id - local inode number of the directory to read. | ||
67 | @start - zero. | ||
68 | |||
69 | |||
70 | @NETFS_READ_PAGE | ||
71 | This command is used to read data from remote server. | ||
72 | Data size does not exceed local page cache size. | ||
73 | |||
74 | @id - inode number. | ||
75 | @start - first byte offset. | ||
76 | @size - number of bytes to read plus length of the path to object. | ||
77 | @ext - object path length. | ||
78 | |||
79 | |||
80 | @NETFS_CREATE | ||
81 | Used to create an object. | ||
82 | It does not require that all directories above the object already | ||
83 | exist; it will create them automatically. Each object has an | ||
84 | associated @netfs_path_entry data structure, which contains the creation | ||
85 | mode (permissions and type) and the length of the name as well as the name itself. | ||
86 | |||
87 | @start - 0 | ||
88 | @size - size of the all data structures needed to create a path | ||
89 | @id - local inode number | ||
90 | @ext - 0 | ||
91 | |||
92 | |||
93 | @NETFS_REMOVE | ||
94 | Used to remove object. | ||
95 | |||
96 | @ext - length of the path to object. | ||
97 | @size - the same. | ||
98 | @id - local inode number. | ||
99 | @start - zero. | ||
100 | |||
101 | |||
102 | @NETFS_LOOKUP | ||
103 | Lookup information about object on server. | ||
104 | |||
105 | @ext - length of the path to object. | ||
106 | @size - the same. | ||
107 | @id - local inode number of the directory to look object in. | ||
108 | @start - local inode number of the object to look at. | ||
109 | |||
110 | |||
111 | @NETFS_LINK | ||
112 | Create a hard link or symlink. | ||
113 | Command is sent as "object_path|target_path". | ||
114 | |||
115 | @size - size of the above string. | ||
116 | @id - parent local inode number. | ||
117 | @start - 1 for symlink, 0 for hardlink. | ||
118 | @ext - size of the "object_path" above. | ||
119 | |||
120 | |||
121 | @NETFS_TRANS | ||
122 | Transaction header. | ||
123 | |||
124 | @size - incorporates all embedded command sizes including their header sizes. | ||
125 | @start - transaction generation number - unique id used to find transaction. | ||
126 | @ext - transaction flags. Unused at the moment. | ||
127 | @id - 0. | ||
128 | |||
129 | |||
130 | @NETFS_OPEN | ||
131 | Open intent for given transaction. | ||
132 | |||
133 | @id - local inode number. | ||
134 | @start - 0. | ||
135 | @size - path length to the object. | ||
136 | @ext - open flags (O_RDWR and so on). | ||
137 | |||
138 | |||
139 | @NETFS_INODE_INFO | ||
140 | Metadata update command. | ||
141 | It is sent to servers when attributes of the object are changed and received | ||
142 | when data or metadata were updated. It operates with the following structure: | ||
143 | |||
144 | struct netfs_inode_info | ||
145 | { | ||
146 | unsigned int mode; | ||
147 | unsigned int nlink; | ||
148 | unsigned int uid; | ||
149 | unsigned int gid; | ||
150 | unsigned int blocksize; | ||
151 | unsigned int padding; | ||
152 | __u64 ino; | ||
153 | __u64 blocks; | ||
154 | __u64 rdev; | ||
155 | __u64 size; | ||
156 | __u64 version; | ||
157 | }; | ||
158 | |||
159 | It effectively mirrors stat(2) returned data. | ||
160 | |||
161 | |||
162 | @ext - path length to the object. | ||
163 | @size - the same plus size of the netfs_inode_info structure. | ||
164 | @id - local inode number. | ||
165 | @start - 0. | ||
166 | |||
167 | |||
168 | @NETFS_PAGE_CACHE | ||
169 | Command is only received by clients. It contains information about | ||
170 | page to be marked as not up-to-date. | ||
171 | |||
172 | @id - client's inode number. | ||
173 | @start - last byte of the page to be invalidated. If it is not equal to | ||
174 | current inode size, it will be vmtruncated(). | ||
175 | @size - 0 | ||
176 | @ext - 0 | ||
177 | |||
178 | |||
179 | @NETFS_READ_PAGES | ||
180 | Used to read multiple contiguous pages in one go. | ||
181 | |||
182 | @start - first byte of the contiguous region to read. | ||
183 | @size - consists of two fields: the lower 8 bits represent the page cache shift | ||
184 | used by the client, the other 3 bytes hold the number of pages. | ||
185 | @id - local inode number. | ||
186 | @ext - path length to the object. | ||
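
A hedged sketch of how such a @size value could be packed and unpacked,
following the field layout described above (the helper names are made
up for illustration):

	#include <stdint.h>

	/* Lower 8 bits: page cache shift; upper 24 bits: number of pages. */
	static inline uint32_t read_pages_pack(uint32_t npages, uint32_t page_shift)
	{
		return (npages << 8) | (page_shift & 0xff);
	}

	static inline void read_pages_unpack(uint32_t size,
					     uint32_t *npages, uint32_t *page_shift)
	{
		*page_shift = size & 0xff;
		*npages     = size >> 8;
	}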
187 | |||
188 | |||
189 | @NETFS_RENAME | ||
190 | Used to rename object. | ||
191 | Attached data is formed into following string: "old_path|new_path". | ||
192 | |||
193 | @id - local inode number. | ||
194 | @start - parent inode number. | ||
195 | @size - length of the above string. | ||
196 | @ext - length of the old path part. | ||
197 | |||
198 | |||
199 | @NETFS_CAPABILITIES | ||
200 | Used to exchange crypto capabilities with server. | ||
201 | If crypto capabilities are not supported by the server, then the client will disable them | ||
202 | or fail (if the 'crypto_fail_unsupported' mount option was specified). | ||
203 | |||
204 | @id - superblock index. Used to specify crypto information for group of servers. | ||
205 | @size - size of the attached capabilities structure. | ||
206 | @start - 0. | ||
207 | @size - 0. | ||
208 | @scsize - 0. | ||
209 | |||
210 | @NETFS_LOCK | ||
211 | Used to send lock request/release messages. Although it sends a byte range request | ||
212 | and is capable of flushing pages based on that, byte ranges are not used, since all Linux | ||
213 | filesystems lock the whole inode. | ||
214 | |||
215 | @id - lock generation number. | ||
216 | @start - start of the locked range. | ||
217 | @size - size of the locked range. | ||
218 | @ext - lock type: read/write. Not actually used. The 15th bit is used to determine | ||
219 | whether it is a lock request (1) or a release (0). | ||
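
For illustration, bit 15 of @ext could be tested as follows (the macro
name is made up; only the bit position comes from the description
above):

	#include <stdint.h>

	#define NETFS_LOCK_REQ_BIT	(1u << 15)	/* 1 = lock request, 0 = release */

	static inline int lock_is_request(uint16_t ext)
	{
		return (ext & NETFS_LOCK_REQ_BIT) != 0;
	}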
220 | |||
221 | @NETFS_XATTR_SET | ||
222 | @NETFS_XATTR_GET | ||
223 | Used to set/get extended attributes for given inode. | ||
224 | @id - attribute generation number or xattr setting type | ||
225 | @start - size of the attribute (request or attached) | ||
226 | @size - name length, path len and data size for given attribute | ||
227 | @ext - path length for given object | ||
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a87be42f8211..ce84cfc9eae0 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -5,6 +5,7 @@ | |||
5 | Bodo Bauer <bb@ricochet.net> | 5 | Bodo Bauer <bb@ricochet.net> |
6 | 6 | ||
7 | 2.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000 | 7 | 2.4.x update Jorge Nerin <comandante@zaralinux.com> November 14 2000 |
8 | move /proc/sys Shen Feng <shen@cn.fujitsu.com> April 1 2009 | ||
8 | ------------------------------------------------------------------------------ | 9 | ------------------------------------------------------------------------------ |
9 | Version 1.3 Kernel version 2.2.12 | 10 | Version 1.3 Kernel version 2.2.12 |
10 | Kernel version 2.4.0-test11-pre4 | 11 | Kernel version 2.4.0-test11-pre4 |
@@ -26,25 +27,17 @@ Table of Contents | |||
26 | 1.6 Parallel port info in /proc/parport | 27 | 1.6 Parallel port info in /proc/parport |
27 | 1.7 TTY info in /proc/tty | 28 | 1.7 TTY info in /proc/tty |
28 | 1.8 Miscellaneous kernel statistics in /proc/stat | 29 | 1.8 Miscellaneous kernel statistics in /proc/stat |
30 | 1.9 Ext4 file system parameters | ||
29 | 31 | ||
30 | 2 Modifying System Parameters | 32 | 2 Modifying System Parameters |
31 | 2.1 /proc/sys/fs - File system data | 33 | |
32 | 2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats | 34 | 3 Per-Process Parameters |
33 | 2.3 /proc/sys/kernel - general kernel parameters | 35 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score |
34 | 2.4 /proc/sys/vm - The virtual memory subsystem | 36 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score |
35 | 2.5 /proc/sys/dev - Device specific parameters | 37 | 3.3 /proc/<pid>/io - Display the IO accounting fields |
36 | 2.6 /proc/sys/sunrpc - Remote procedure calls | 38 | 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings |
37 | 2.7 /proc/sys/net - Networking stuff | 39 | 3.5 /proc/<pid>/mountinfo - Information about mounts |
38 | 2.8 /proc/sys/net/ipv4 - IPV4 settings | 40 | |
39 | 2.9 Appletalk | ||
40 | 2.10 IPX | ||
41 | 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem | ||
42 | 2.12 /proc/<pid>/oom_adj - Adjust the oom-killer score | ||
43 | 2.13 /proc/<pid>/oom_score - Display current oom-killer score | ||
44 | 2.14 /proc/<pid>/io - Display the IO accounting fields | ||
45 | 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings | ||
46 | 2.16 /proc/<pid>/mountinfo - Information about mounts | ||
47 | 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface | ||
48 | 41 | ||
49 | ------------------------------------------------------------------------------ | 42 | ------------------------------------------------------------------------------ |
50 | Preface | 43 | Preface |
@@ -940,27 +933,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname> | |||
940 | File Content | 933 | File Content |
941 | mb_groups details of multiblock allocator buddy cache of free blocks | 934 | mb_groups details of multiblock allocator buddy cache of free blocks |
942 | mb_history multiblock allocation history | 935 | mb_history multiblock allocation history |
943 | stats controls whether the multiblock allocator should start | ||
944 | collecting statistics, which are shown during the unmount | ||
945 | group_prealloc the multiblock allocator will round up allocation | ||
946 | requests to a multiple of this tuning parameter if the | ||
947 | stripe size is not set in the ext4 superblock | ||
948 | max_to_scan The maximum number of extents the multiblock allocator | ||
949 | will search to find the best extent | ||
950 | min_to_scan The minimum number of extents the multiblock allocator | ||
951 | will search to find the best extent | ||
952 | order2_req Tuning parameter which controls the minimum size for | ||
953 | requests (as a power of 2) where the buddy cache is | ||
954 | used | ||
955 | stream_req Files which have fewer blocks than this tunable | ||
956 | parameter will have their blocks allocated out of a | ||
957 | block group specific preallocation pool, so that small | ||
958 | files are packed closely together. Each large file | ||
959 | will have its blocks allocated out of its own unique | ||
960 | preallocation pool. | ||
961 | inode_readahead Tuning parameter which controls the maximum number of | ||
962 | inode table blocks that ext4's inode table readahead | ||
963 | algorithm will pre-read into the buffer cache | ||
964 | .............................................................................. | 936 | .............................................................................. |
965 | 937 | ||
966 | 938 | ||
@@ -1011,1014 +983,24 @@ review the kernel documentation in the directory /usr/src/linux/Documentation. | |||
1011 | This chapter is heavily based on the documentation included in the pre 2.2 | 983 | This chapter is heavily based on the documentation included in the pre 2.2 |
1012 | kernels, and became part of it in version 2.2.1 of the Linux kernel. | 984 | kernels, and became part of it in version 2.2.1 of the Linux kernel. |
1013 | 985 | ||
1014 | 2.1 /proc/sys/fs - File system data | 986 | Please see: Documentation/sysctls/ directory for descriptions of these |
1015 | ----------------------------------- | ||
1016 | |||
1017 | This subdirectory contains specific file system, file handle, inode, dentry | ||
1018 | and quota information. | ||
1019 | |||
1020 | Currently, these files are in /proc/sys/fs: | ||
1021 | |||
1022 | dentry-state | ||
1023 | ------------ | ||
1024 | |||
1025 | Status of the directory cache. Since directory entries are dynamically | ||
1026 | allocated and deallocated, this file indicates the current status. It holds | ||
1027 | six values, in which the last two are not used and are always zero. The others | ||
1028 | are listed in table 2-1. | ||
1029 | |||
1030 | |||
1031 | Table 2-1: Status files of the directory cache | ||
1032 | .............................................................................. | ||
1033 | File Content | ||
1034 | nr_dentry Almost always zero | ||
1035 | nr_unused Number of unused cache entries | ||
1036 | age_limit Age in seconds after which dentry cache entries | ||
1037 | may be reclaimed, when memory is short | ||
1038 | want_pages Used internally when memory is short | ||
1039 | .............................................................................. | ||
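
For illustration, the file might read as follows (the values vary per
system and are examples only; note the last two fields are zero):

  # cat /proc/sys/fs/dentry-state
  87292  82513  45  0  0  0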
1040 | |||
1041 | dquot-nr and dquot-max | ||
1042 | ---------------------- | ||
1043 | |||
1044 | The file dquot-max shows the maximum number of cached disk quota entries. | ||
1045 | |||
1046 | The file dquot-nr shows the number of allocated disk quota entries and the | ||
1047 | number of free disk quota entries. | ||
1048 | |||
1049 | If the number of available cached disk quotas is very low and you have a large | ||
1050 | number of simultaneous system users, you might want to raise the limit. | ||
1051 | |||
1052 | file-nr and file-max | ||
1053 | -------------------- | ||
1054 | |||
1055 | The kernel allocates file handles dynamically, but doesn't free them again at | ||
1056 | this time. | ||
1057 | |||
1058 | The value in file-max denotes the maximum number of file handles that the | ||
1059 | Linux kernel will allocate. When you get a lot of error messages about running | ||
1060 | out of file handles, you might want to raise this limit. The default value is | ||
1061 | 10% of RAM in kilobytes. To change it, just write the new number into the | ||
1062 | file: | ||
1063 | |||
1064 | # cat /proc/sys/fs/file-max | ||
1065 | 4096 | ||
1066 | # echo 8192 > /proc/sys/fs/file-max | ||
1067 | # cat /proc/sys/fs/file-max | ||
1068 | 8192 | ||
1069 | |||
1070 | |||
1071 | This method of revision is useful for all customizable parameters of the | ||
1072 | kernel - simply echo the new value to the corresponding file. | ||
1073 | |||
1074 | Historically, the three values in file-nr denoted the number of allocated file | ||
1075 | handles, the number of allocated but unused file handles, and the maximum | ||
1076 | number of file handles. Linux 2.6 always reports 0 as the number of free file | ||
1077 | handles -- this is not an error, it just means that the number of allocated | ||
1078 | file handles exactly matches the number of used file handles. | ||
1079 | |||
1080 | Attempts to allocate more file descriptors than file-max are reported with | ||
1081 | printk, look for "VFS: file-max limit <number> reached". | ||
1082 | |||
1083 | inode-state and inode-nr | ||
1084 | ------------------------ | ||
1085 | |||
1086 | The file inode-nr contains the first two items from inode-state, so we'll skip | ||
1087 | to that file... | ||
1088 | |||
1089 | inode-state contains two actual numbers and five dummy values. The numbers | ||
1090 | are nr_inodes and nr_free_inodes (in order of appearance). | ||
1091 | |||
1092 | nr_inodes | ||
1093 | ~~~~~~~~~ | ||
1094 | |||
1095 | Denotes the number of inodes the system has allocated. This number will | ||
1096 | grow and shrink dynamically. | ||
1097 | |||
1098 | nr_open | ||
1099 | ------- | ||
1100 | |||
1101 | Denotes the maximum number of file-handles a process can | ||
1102 | allocate. Default value is 1024*1024 (1048576) which should be | ||
1103 | enough for most machines. Actual limit depends on RLIMIT_NOFILE | ||
1104 | resource limit. | ||
1105 | |||
1106 | nr_free_inodes | ||
1107 | -------------- | ||
1108 | |||
1109 | Represents the number of free inodes, i.e. the number of in-use inodes is | ||
1110 | (nr_inodes - nr_free_inodes). | ||
1111 | |||
1112 | aio-nr and aio-max-nr | ||
1113 | --------------------- | ||
1114 | |||
1115 | aio-nr is the running total of the number of events specified on the | ||
1116 | io_setup system call for all currently active aio contexts. If aio-nr | ||
1117 | reaches aio-max-nr then io_setup will fail with EAGAIN. Note that | ||
1118 | raising aio-max-nr does not result in the pre-allocation or re-sizing | ||
1119 | of any kernel data structures. | ||
1120 | |||
1121 | 2.2 /proc/sys/fs/binfmt_misc - Miscellaneous binary formats | ||
1122 | ----------------------------------------------------------- | ||
1123 | |||
1124 | Besides these files, there is the subdirectory /proc/sys/fs/binfmt_misc. This | ||
1125 | handles the kernel support for miscellaneous binary formats. | ||
1126 | |||
1127 | Binfmt_misc provides the ability to register additional binary formats to the | ||
1128 | Kernel without compiling an additional module/kernel. Therefore, binfmt_misc | ||
1129 | needs to know magic numbers at the beginning or the filename extension of the | ||
1130 | binary. | ||
1131 | |||
1132 | It works by maintaining a linked list of structs that contain a description of | ||
1133 | a binary format, including a magic with size (or the filename extension), | ||
1134 | offset and mask, and the interpreter name. On request it invokes the given | ||
1135 | interpreter with the original program as argument, as binfmt_java and | ||
1136 | binfmt_em86 and binfmt_mz do. Since binfmt_misc does not define any default | ||
1137 | binary-formats, you have to register an additional binary-format. | ||
1138 | |||
1139 | There are two general files in binfmt_misc and one file per registered format. | ||
1140 | The two general files are register and status. | ||
1141 | |||
1142 | Registering a new binary format | ||
1143 | ------------------------------- | ||
1144 | |||
1145 | To register a new binary format you have to issue the command | ||
1146 | |||
1147 | echo :name:type:offset:magic:mask:interpreter: > /proc/sys/fs/binfmt_misc/register | ||
1148 | |||
1149 | |||
1150 | |||
1151 | with appropriate name (the name for the /proc-dir entry), offset (defaults to | ||
1152 | 0, if omitted), magic, mask (which can be omitted, defaults to all 0xff) and | ||
1153 | last but not least, the interpreter that is to be invoked (for example and | ||
1154 | testing /bin/echo). Type can be M for usual magic matching or E for filename | ||
1155 | extension matching (give extension in place of magic). | ||
1156 | |||
1157 | Check or reset the status of the binary format handler | ||
1158 | ------------------------------------------------------ | ||
1159 | |||
1160 | If you do a cat on the file /proc/sys/fs/binfmt_misc/status, you will get the | ||
1161 | current status (enabled/disabled) of binfmt_misc. Change the status by echoing | ||
1162 | 0 (disables) or 1 (enables) or -1 (caution: this clears all previously | ||
1163 | registered binary formats) to status. For example echo 0 > status to disable | ||
1164 | binfmt_misc (temporarily). | ||
1165 | |||
1166 | Status of a single handler | ||
1167 | -------------------------- | ||
1168 | |||
1169 | Each registered handler has an entry in /proc/sys/fs/binfmt_misc. These files | ||
1170 | perform the same function as status, but their scope is limited to the actual | ||
1171 | binary format. By catting this file, you also receive all related information | ||
1172 | about the interpreter/magic of the binfmt. | ||
1173 | |||
1174 | Example usage of binfmt_misc (emulate binfmt_java) | ||
1175 | -------------------------------------------------- | ||
1176 | |||
1177 | cd /proc/sys/fs/binfmt_misc | ||
1178 | echo ':Java:M::\xca\xfe\xba\xbe::/usr/local/java/bin/javawrapper:' > register | ||
1179 | echo ':HTML:E::html::/usr/local/java/bin/appletviewer:' > register | ||
1180 | echo ':Applet:M::<!--applet::/usr/local/java/bin/appletviewer:' > register | ||
1181 | echo ':DEXE:M::\x0eDEX::/usr/bin/dosexec:' > register | ||
1182 | |||
1183 | |||
1184 | These four lines add support for Java executables and Java applets (like | ||
1185 | binfmt_java, additionally recognizing the .html extension with no need to put | ||
1186 | <!--applet> to every applet file). You have to install the JDK and the | ||
1187 | shell-script /usr/local/java/bin/javawrapper too. It works around the | ||
1188 | brokenness of the Java filename handling. To add a Java binary, just create a | ||
1189 | link to the class-file somewhere in the path. | ||
1190 | |||
1191 | 2.3 /proc/sys/kernel - general kernel parameters | ||
1192 | ------------------------------------------------ | ||
1193 | |||
1194 | This directory reflects general kernel behaviors. As I've said before, the | ||
1195 | contents depend on your configuration. Here you'll find the most important | ||
1196 | files, along with descriptions of what they mean and how to use them. | ||
1197 | |||
1198 | acct | ||
1199 | ---- | ||
1200 | |||
1201 | The file contains three values; highwater, lowwater, and frequency. | ||
1202 | |||
1203 | It exists only when BSD-style process accounting is enabled. These values | ||
1204 | control its behavior. If the free space on the file system where the log lives | ||
1205 | goes below lowwater percentage, accounting suspends. If it goes above | ||
1206 | highwater percentage, accounting resumes. Frequency determines how often you | ||
1207 | check the amount of free space (value is in seconds). Default settings are: 4, | ||
1208 | 2, and 30. That is, suspend accounting if there is less than 2 percent free; | ||
1209 | resume it if we have a value of 3 or more percent; consider information about | ||
1210 | the amount of free space valid for 30 seconds | ||
1211 | |||
1212 | ctrl-alt-del | ||
1213 | ------------ | ||
1214 | |||
1215 | When the value in this file is 0, ctrl-alt-del is trapped and sent to the init | ||
1216 | program to handle a graceful restart. However, when the value is greater than | ||
1217 | zero, Linux's reaction to this key combination will be an immediate reboot, | ||
1218 | without syncing its dirty buffers. | ||
1219 | |||
1220 | [NOTE] | ||
1221 | When a program (like dosemu) has the keyboard in raw mode, the | ||
1222 | ctrl-alt-del is intercepted by the program before it ever reaches the | ||
1223 | kernel tty layer, and it is up to the program to decide what to do with | ||
1224 | it. | ||
1225 | |||
1226 | domainname and hostname | ||
1227 | ----------------------- | ||
1228 | |||
1229 | These files can be controlled to set the NIS domainname and hostname of your | ||
1230 | box. For the classic darkstar.frop.org a simple: | ||
1231 | |||
1232 | # echo "darkstar" > /proc/sys/kernel/hostname | ||
1233 | # echo "frop.org" > /proc/sys/kernel/domainname | ||
1234 | |||
1235 | |||
1236 | would suffice to set your hostname and NIS domainname. | ||
1237 | |||
1238 | osrelease, ostype and version | ||
1239 | ----------------------------- | ||
1240 | |||
1241 | The names make it pretty obvious what these fields contain: | ||
1242 | |||
1243 | > cat /proc/sys/kernel/osrelease | ||
1244 | 2.2.12 | ||
1245 | |||
1246 | > cat /proc/sys/kernel/ostype | ||
1247 | Linux | ||
1248 | |||
1249 | > cat /proc/sys/kernel/version | ||
1250 | #4 Fri Oct 1 12:41:14 PDT 1999 | ||
1251 | |||
1252 | |||
1253 | The files osrelease and ostype should be clear enough. Version needs a little | ||
1254 | more clarification. The #4 means that this is the 4th kernel built from this | ||
1255 | source base and the date after it indicates the time the kernel was built. The | ||
1256 | only way to tune these values is to rebuild the kernel. | ||
1257 | |||
1258 | panic | ||
1259 | ----- | ||
1260 | |||
1261 | The value in this file represents the number of seconds the kernel waits | ||
1262 | before rebooting on a panic. When you use the software watchdog, the | ||
1263 | recommended setting is 60. If set to 0, the auto reboot after a kernel panic | ||
1264 | is disabled, which is the default setting. | ||
1265 | |||
1266 | printk | ||
1267 | ------ | ||
1268 | |||
1269 | The four values in printk denote | ||
1270 | * console_loglevel, | ||
1271 | * default_message_loglevel, | ||
1272 | * minimum_console_loglevel and | ||
1273 | * default_console_loglevel | ||
1274 | respectively. | ||
1275 | |||
1276 | These values influence printk() behavior when printing or logging error | ||
1277 | messages, which come from inside the kernel. See syslog(2) for more | ||
1278 | information on the different log levels. | ||
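
For instance, reading the file shows the four values in that order (the
numbers below are only an example):

  # cat /proc/sys/kernel/printk
  6       4       1       7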
1279 | |||
1280 | console_loglevel | ||
1281 | ---------------- | ||
1282 | |||
1283 | Messages with a higher priority than this will be printed to the console. | ||
1284 | |||
1285 | default_message_loglevel | ||
1286 | ------------------------ | ||
1287 | |||
1288 | Messages without an explicit priority will be printed with this priority. | ||
1289 | |||
1290 | minimum_console_loglevel | ||
1291 | ------------------------ | ||
1292 | |||
1293 | Minimum (highest) value to which the console_loglevel can be set. | ||
1294 | |||
1295 | default_console_loglevel | ||
1296 | ------------------------ | ||
1297 | |||
1298 | Default value for console_loglevel. | ||
1299 | |||
1300 | sg-big-buff | ||
1301 | ----------- | ||
1302 | |||
1303 | This file shows the size of the generic SCSI (sg) buffer. At this point, you | ||
1304 | can't tune it yet, but you can change it at compile time by editing | ||
1305 | include/scsi/sg.h and changing the value of SG_BIG_BUFF. | ||
1306 | |||
1307 | If you use a scanner with SANE (Scanner Access Now Easy) you might want to set | ||
1308 | this to a higher value. Refer to the SANE documentation on this issue. | ||
1309 | |||
1310 | modprobe | ||
1311 | -------- | ||
1312 | |||
1313 | The path to the modprobe binary. The kernel uses this | ||
1314 | program to load modules on demand. | ||
1315 | |||
1316 | unknown_nmi_panic | ||
1317 | ----------------- | ||
1318 | |||
1319 | The value in this file affects the behavior of NMI handling. When the value is | ||
1320 | non-zero, an unknown NMI is trapped and then a panic occurs. At that time, kernel | ||
1321 | debugging information is displayed on the console. | ||
1322 | |||
1323 | The NMI switch that most IA32 servers have fires an unknown NMI, for example. | ||
1324 | If a system hangs, try pressing the NMI switch. | ||
1325 | |||
1326 | panic_on_unrecovered_nmi | ||
1327 | ------------------------ | ||
1328 | |||
1329 | The default Linux behaviour on an NMI of either memory or unknown is to continue | ||
1330 | operation. For many environments such as scientific computing it is preferable | ||
1331 | that the box is taken down and the error dealt with rather than have an uncorrected | ||
1332 | parity/ECC error propagated. | ||
1333 | |||
1334 | A small number of systems do generate NMI's for bizarre random reasons such as | ||
1335 | power management so the default is off. That sysctl works like the existing | ||
1336 | panic controls already in that directory. | ||
1337 | |||
1338 | nmi_watchdog | ||
1339 | ------------ | ||
1340 | |||
1341 | Enables/Disables the NMI watchdog on x86 systems. When the value is non-zero | ||
1342 | the NMI watchdog is enabled and will continuously test all online cpus to | ||
1343 | determine whether or not they are still functioning properly. Currently, | ||
1344 | passing "nmi_watchdog=" parameter at boot time is required for this function | ||
1345 | to work. | ||
1346 | |||
1347 | If LAPIC NMI watchdog method is in use (nmi_watchdog=2 kernel parameter), the | ||
1348 | NMI watchdog shares registers with oprofile. By disabling the NMI watchdog, | ||
1349 | oprofile may have more registers to utilize. | ||
1350 | |||
1351 | msgmni | ||
1352 | ------ | ||
1353 | |||
1354 | Maximum number of message queue ids on the system. | ||
1355 | This value scales to the amount of lowmem. It is automatically recomputed | ||
1356 | upon memory add/remove or ipc namespace creation/removal. | ||
1357 | When a value is written into this file, msgmni's value becomes fixed, i.e. it | ||
1358 | is not recomputed anymore when one of the above events occurs. | ||
1359 | Use auto_msgmni to change this behavior. | ||
1360 | |||
1361 | auto_msgmni | ||
1362 | ----------- | ||
1363 | |||
1364 | Enables/Disables automatic recomputing of msgmni upon memory add/remove or | ||
1365 | upon ipc namespace creation/removal (see the msgmni description above). | ||
1366 | Echoing "1" into this file enables msgmni automatic recomputing. | ||
1367 | Echoing "0" turns it off. | ||
1368 | auto_msgmni default value is 1. | ||
1369 | |||
1370 | |||
1371 | 2.4 /proc/sys/vm - The virtual memory subsystem | ||
1372 | ----------------------------------------------- | ||
1373 | |||
1374 | Please see: Documentation/sysctls/vm.txt for a description of these | ||
1375 | entries. | 987 | entries. |
1376 | 988 | ||
989 | ------------------------------------------------------------------------------ | ||
990 | Summary | ||
991 | ------------------------------------------------------------------------------ | ||
992 | Certain aspects of kernel behavior can be modified at runtime, without the | ||
993 | need to recompile the kernel, or even to reboot the system. The files in the | ||
994 | /proc/sys tree can not only be read, but also modified. You can use the echo | ||
995 | command to write value into these files, thereby changing the default settings | ||
996 | of the kernel. | ||
997 | ------------------------------------------------------------------------------ | ||
1377 | 998 | ||
1378 | 2.5 /proc/sys/dev - Device specific parameters | 999 | ------------------------------------------------------------------------------ |
1379 | ---------------------------------------------- | 1000 | CHAPTER 3: PER-PROCESS PARAMETERS |
1380 | 1001 | ------------------------------------------------------------------------------ | |
1381 | Currently there is only support for CDROM drives, and for those, there is only | ||
1382 | one read-only file containing information about the CD-ROM drives attached to | ||
1383 | the system: | ||
1384 | |||
1385 | >cat /proc/sys/dev/cdrom/info | ||
1386 | CD-ROM information, Id: cdrom.c 2.55 1999/04/25 | ||
1387 | |||
1388 | drive name: sr0 hdb | ||
1389 | drive speed: 32 40 | ||
1390 | drive # of slots: 1 0 | ||
1391 | Can close tray: 1 1 | ||
1392 | Can open tray: 1 1 | ||
1393 | Can lock tray: 1 1 | ||
1394 | Can change speed: 1 1 | ||
1395 | Can select disk: 0 1 | ||
1396 | Can read multisession: 1 1 | ||
1397 | Can read MCN: 1 1 | ||
1398 | Reports media changed: 1 1 | ||
1399 | Can play audio: 1 1 | ||
1400 | |||
1401 | |||
1402 | You see two drives, sr0 and hdb, along with a list of their features. | ||
1403 | |||
1404 | 2.6 /proc/sys/sunrpc - Remote procedure calls | ||
1405 | --------------------------------------------- | ||
1406 | |||
1407 | This directory contains four files, which enable or disable debugging for the | ||
1408 | RPC functions NFS, NFS-daemon, RPC and NLM. The default value is 0 for each; | ||
1409 | they can be set to one to turn debugging on. | ||
1410 | |||
1411 | 2.7 /proc/sys/net - Networking stuff | ||
1412 | ------------------------------------ | ||
1413 | |||
1414 | The interface to the networking parts of the kernel is located in | ||
1415 | /proc/sys/net. Table 2-3 shows all possible subdirectories. You may see only | ||
1416 | some of them, depending on your kernel's configuration. | ||
1417 | |||
1418 | |||
1419 | Table 2-3: Subdirectories in /proc/sys/net | ||
1420 | .............................................................................. | ||
1421 | Directory Content Directory Content | ||
1422 | core General parameter appletalk Appletalk protocol | ||
1423 | unix Unix domain sockets netrom NET/ROM | ||
1424 | 802 E802 protocol ax25 AX25 | ||
1425 | ethernet Ethernet protocol rose X.25 PLP layer | ||
1426 | ipv4 IP version 4 x25 X.25 protocol | ||
1427 | ipx IPX token-ring IBM token ring | ||
1428 | bridge Bridging decnet DEC net | ||
1429 | ipv6 IP version 6 | ||
1430 | .............................................................................. | ||
1431 | |||
1432 | We will concentrate on IP networking here. Since AX25, X.25, and DEC Net are | ||
1433 | only minor players in the Linux world, we'll skip them in this chapter. You'll | ||
1434 | find some short info on Appletalk and IPX further on in this chapter. Review | ||
1435 | the online documentation and the kernel source to get a detailed view of the | ||
1436 | parameters for those protocols. In this section we'll discuss the | ||
1437 | subdirectories printed in bold letters in the table above. As default values | ||
1438 | are suitable for most needs, there is no need to change these values. | ||
1439 | |||
1440 | /proc/sys/net/core - Network core options | ||
1441 | ----------------------------------------- | ||
1442 | |||
1443 | rmem_default | ||
1444 | ------------ | ||
1445 | |||
1446 | The default setting of the socket receive buffer in bytes. | ||
1447 | |||
1448 | rmem_max | ||
1449 | -------- | ||
1450 | |||
1451 | The maximum receive socket buffer size in bytes. | ||
1452 | |||
1453 | wmem_default | ||
1454 | ------------ | ||
1455 | |||
1456 | The default setting (in bytes) of the socket send buffer. | ||
1457 | |||
1458 | wmem_max | ||
1459 | -------- | ||
1460 | |||
1461 | The maximum send socket buffer size in bytes. | ||
1462 | |||
1463 | message_burst and message_cost | ||
1464 | ------------------------------ | ||
1465 | |||
1466 | These parameters are used to limit the warning messages written to the kernel | ||
1467 | log from the networking code. They enforce a rate limit to make a | ||
1468 | denial-of-service attack impossible. A higher message_cost factor results in | ||
1469 | fewer messages being written. Message_burst controls when messages will | ||
1470 | be dropped. The default settings limit warning messages to one every five | ||
1471 | seconds. | ||
1472 | |||
1473 | warnings | ||
1474 | -------- | ||
1475 | |||
1476 | This controls console messages from the networking stack that can occur because | ||
1477 | of problems on the network like duplicate address or bad checksums. Normally, | ||
1478 | this should be enabled, but if the problem persists the messages can be | ||
1479 | disabled. | ||
1480 | |||
1481 | |||
1482 | netdev_max_backlog | ||
1483 | ------------------ | ||
1484 | |||
1485 | Maximum number of packets, queued on the INPUT side, when the interface | ||
1486 | receives packets faster than kernel can process them. | ||
1487 | |||
1488 | optmem_max | ||
1489 | ---------- | ||
1490 | |||
1491 | Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence | ||
1492 | of struct cmsghdr structures with appended data. | ||
1493 | |||
1494 | /proc/sys/net/unix - Parameters for Unix domain sockets | ||
1495 | ------------------------------------------------------- | ||
1496 | |||
1497 | There are only two files in this subdirectory. They control the delays for | ||
1498 | deleting and destroying socket descriptors. | ||
1499 | |||
1500 | 2.8 /proc/sys/net/ipv4 - IPV4 settings | ||
1501 | -------------------------------------- | ||
1502 | |||
1503 | IP version 4 is still the most used protocol in Unix networking. It will be | ||
1504 | replaced by IP version 6 in the next couple of years, but for the moment it's | ||
1505 | the de facto standard for the internet and is used in most networking | ||
1506 | environments around the world. Because of the importance of this protocol, | ||
1507 | we'll have a deeper look into the subtree controlling the behavior of the IPv4 | ||
1508 | subsystem of the Linux kernel. | ||
1509 | |||
1510 | Let's start with the entries in /proc/sys/net/ipv4. | ||
1511 | |||
1512 | ICMP settings | ||
1513 | ------------- | ||
1514 | |||
1515 | icmp_echo_ignore_all and icmp_echo_ignore_broadcasts | ||
1516 | ---------------------------------------------------- | ||
1517 | |||
1518 | Turn on (1) or off (0), if the kernel should ignore all ICMP ECHO requests, or | ||
1519 | just those to broadcast and multicast addresses. | ||
1520 | |||
1521 | Please note that if you accept ICMP echo requests with a broadcast/multicast | ||
1522 | destination address your network may be used as an exploder for denial of | ||
1523 | service packet flooding attacks to other hosts. | ||
1524 | |||
1525 | icmp_destunreach_rate, icmp_echoreply_rate, icmp_paramprob_rate and icmp_timeexeed_rate | ||
1526 | --------------------------------------------------------------------------------------- | ||
1527 | |||
1528 | Sets limits for sending ICMP packets to specific targets. A value of zero | ||
1529 | disables all limiting. Any positive value sets the maximum packet rate in | ||
1530 | hundredths of a second (on Intel systems). | ||
1531 | |||
1532 | IP settings | ||
1533 | ----------- | ||
1534 | |||
1535 | ip_autoconfig | ||
1536 | ------------- | ||
1537 | |||
1538 | This file contains the number one if the host received its IP configuration by | ||
1539 | RARP, BOOTP, DHCP or a similar mechanism. Otherwise it is zero. | ||
1540 | |||
1541 | ip_default_ttl | ||
1542 | -------------- | ||
1543 | |||
1544 | TTL (Time To Live) for IPv4 interfaces. This is simply the maximum number of | ||
1545 | hops a packet may travel. | ||
1546 | |||
1547 | ip_dynaddr | ||
1548 | ---------- | ||
1549 | |||
1550 | Enable dynamic socket address rewriting on interface address change. This is | ||
1551 | useful for dialup interface with changing IP addresses. | ||
1552 | |||
1553 | ip_forward | ||
1554 | ---------- | ||
1555 | |||
1556 | Enable or disable forwarding of IP packets between interfaces. Changing this | ||
1557 | value resets all other parameters to their default values. They differ if the | ||
1558 | kernel is configured as host or router. | ||
1559 | |||
1560 | ip_local_port_range | ||
1561 | ------------------- | ||
1562 | |||
1563 | Range of ports used by TCP and UDP to choose the local port. Contains two | ||
1564 | numbers, the first number is the lowest port, the second number the highest | ||
1565 | local port. Default is 1024-4999. Should be changed to 32768-61000 for | ||
1566 | high-usage systems. | ||
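
For example, to widen the range as suggested (the file takes the two
numbers separated by whitespace):

  # echo "32768 61000" > /proc/sys/net/ipv4/ip_local_port_range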
1567 | |||
1568 | ip_no_pmtu_disc | ||
1569 | --------------- | ||
1570 | |||
1571 | Global switch to turn path MTU discovery off. It can also be set on a per | ||
1572 | socket basis by the applications or on a per route basis. | ||
1573 | |||
1574 | ip_masq_debug | ||
1575 | ------------- | ||
1576 | |||
1577 | Enable/disable debugging of IP masquerading. | ||
1578 | |||
1579 | IP fragmentation settings | ||
1580 | ------------------------- | ||
1581 | |||
1582 | ipfrag_high_thresh and ipfrag_low_thresh | ||
1583 | ---------------------------------------- | ||
1584 | |||
1585 | Maximum memory used to reassemble IP fragments. When ipfrag_high_thresh bytes | ||
1586 | of memory are allocated for this purpose, the fragment handler will toss | ||
1587 | packets until ipfrag_low_thresh is reached. | ||
1588 | |||
1589 | ipfrag_time | ||
1590 | ----------- | ||
1591 | |||
1592 | Time in seconds to keep an IP fragment in memory. | ||
1593 | |||
1594 | TCP settings | ||
1595 | ------------ | ||
1596 | |||
1597 | tcp_ecn | ||
1598 | ------- | ||
1599 | |||
1600 | This file controls the use of the ECN bit in the IPv4 headers. ECN (Explicit | ||
1601 | Congestion Notification) is a relatively new feature, and some routers and | ||
1602 | firewalls block traffic that has this bit set, so it may be necessary to echo | ||
1603 | 0 to /proc/sys/net/ipv4/tcp_ecn if you want to talk to those sites. For more | ||
1604 | information see RFC2481. | ||
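     | |||
     | For instance, to disable ECN when a remote site seems unreachable because of | ||
     | such filtering (illustrative command): | ||
     | |||
     |   $ echo 0 > /proc/sys/net/ipv4/tcp_ecn | ||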
1605 | |||
1606 | tcp_retrans_collapse | ||
1607 | -------------------- | ||
1608 | |||
1609 | Bug-to-bug compatibility with some broken printers. On retransmit, try to send | ||
1610 | larger packets to work around bugs in certain TCP stacks. Can be turned off by | ||
1611 | setting it to zero. | ||
1612 | |||
1613 | tcp_keepalive_probes | ||
1614 | -------------------- | ||
1615 | |||
1616 | Number of keepalive probes TCP sends out before it decides that the | ||
1617 | connection is broken. | ||
1618 | |||
1619 | tcp_keepalive_time | ||
1620 | ------------------ | ||
1621 | |||
1622 | How often TCP sends out keepalive messages when keepalive is enabled. The | ||
1623 | default is 2 hours. | ||
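     | |||
     | As an illustration, the interval could be lowered from the 2 hour default to | ||
     | 30 minutes (the value is given in seconds; 1800 is just an example): | ||
     | |||
     |   $ echo 1800 > /proc/sys/net/ipv4/tcp_keepalive_time | ||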
1624 | |||
1625 | tcp_syn_retries | ||
1626 | --------------- | ||
1627 | |||
1628 | Number of times initial SYNs for a TCP connection attempt will be | ||
1629 | retransmitted. Should not be higher than 255. This applies only to outgoing | ||
1630 | connections; for incoming connections the number of retransmits is defined | ||
1631 | by tcp_retries1. | ||
1632 | |||
1633 | tcp_sack | ||
1634 | -------- | ||
1635 | |||
1636 | Enable selective acknowledgments (SACK) as defined in RFC2018. | ||
1637 | |||
1638 | tcp_timestamps | ||
1639 | -------------- | ||
1640 | |||
1641 | Enable timestamps as defined in RFC1323. | ||
1642 | |||
1643 | tcp_stdurg | ||
1644 | ---------- | ||
1645 | |||
1646 | Enable the strict RFC793 interpretation of the TCP urgent pointer field. The | ||
1647 | default is to use the BSD compatible interpretation of the urgent pointer | ||
1648 | pointing to the first byte after the urgent data. The RFC793 interpretation is | ||
1649 | to have it point to the last byte of urgent data. Enabling this option may | ||
1650 | lead to interoperability problems. Disabled by default. | ||
1651 | |||
1652 | tcp_syncookies | ||
1653 | -------------- | ||
1654 | |||
1655 | Only valid when the kernel was compiled with CONFIG_SYNCOOKIES. Send out | ||
1656 | syncookies when the syn backlog queue of a socket overflows. This is to ward | ||
1657 | off the common 'syn flood attack'. Disabled by default. | ||
1658 | |||
1659 | Note that with syncookies enabled the concept of a socket backlog is | ||
1660 | abandoned. This means the peer may not receive reliable error messages from | ||
1661 | an overloaded server. | ||
1662 | |||
1663 | tcp_window_scaling | ||
1664 | ------------------ | ||
1665 | |||
1666 | Enable window scaling as defined in RFC1323. | ||
1667 | |||
1668 | tcp_fin_timeout | ||
1669 | --------------- | ||
1670 | |||
1671 | The length of time in seconds to wait for a final FIN before the socket is | ||
1672 | forcibly closed. This is strictly a violation of the TCP specification, but | ||
1673 | required to prevent denial-of-service attacks. | ||
1674 | |||
1675 | tcp_max_ka_probes | ||
1676 | ----------------- | ||
1677 | |||
1678 | Indicates how many keepalive probes are sent per slow timer run. Should not | ||
1679 | be set too high, in order to prevent bursts. | ||
1680 | |||
1681 | tcp_max_syn_backlog | ||
1682 | ------------------- | ||
1683 | |||
1684 | Length of the per-socket backlog queue. Since Linux 2.2 the backlog specified | ||
1685 | in listen(2) only specifies the length of the backlog queue of already | ||
1686 | established sockets. When more connection requests arrive, Linux starts to | ||
1687 | drop packets. When syncookies are enabled the packets are still answered and | ||
1688 | the maximum queue is effectively ignored. | ||
1689 | |||
1690 | tcp_retries1 | ||
1691 | ------------ | ||
1692 | |||
1693 | Defines how often an answer to a TCP connection request is retransmitted | ||
1694 | before giving up. | ||
1695 | |||
1696 | tcp_retries2 | ||
1697 | ------------ | ||
1698 | |||
1699 | Defines how often a TCP packet is retransmitted before giving up. | ||
1700 | |||
1701 | Interface specific settings | ||
1702 | --------------------------- | ||
1703 | |||
1704 | In the directory /proc/sys/net/ipv4/conf you'll find one subdirectory for each | ||
1705 | interface the system knows about and one subdirectory called all. Changes in | ||
1706 | the all subdirectory affect all interfaces, whereas changes in the other | ||
1707 | subdirectories affect only one interface. All directories have the same | ||
1708 | entries: | ||
1709 | |||
1710 | accept_redirects | ||
1711 | ---------------- | ||
1712 | |||
1713 | This switch decides if the kernel accepts ICMP redirect messages or not. The | ||
1714 | default is 'yes' if the kernel is configured for a regular host and 'no' for a | ||
1715 | router configuration. | ||
1716 | |||
1717 | accept_source_route | ||
1718 | ------------------- | ||
1719 | |||
1720 | Determines whether source routed packets are accepted or declined. The | ||
1721 | default is dependent on the kernel configuration. It's 'yes' for routers | ||
1722 | and 'no' for hosts. | ||
1723 | |||
1724 | bootp_relay | ||
1725 | ----------- | ||
1726 | |||
1727 | Accept packets with source address 0.b.c.d that are not destined for this | ||
1728 | host as local ones. It is expected that a BOOTP relay daemon will catch and | ||
1729 | forward such packets. | ||
1730 | |||
1731 | The default is 0, since this feature is not implemented yet (kernel version | ||
1732 | 2.2.12). | ||
1733 | |||
1734 | forwarding | ||
1735 | ---------- | ||
1736 | |||
1737 | Enable or disable IP forwarding on this interface. | ||
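     | |||
     | For example, forwarding could be enabled on a single interface only (eth0 is | ||
     | a placeholder name here) while leaving the others untouched: | ||
     | |||
     |   $ echo 1 > /proc/sys/net/ipv4/conf/eth0/forwarding | ||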
1738 | |||
1739 | log_martians | ||
1740 | ------------ | ||
1741 | |||
1742 | Log packets whose source addresses have no known route to the kernel log. | ||
1743 | |||
1744 | mc_forwarding | ||
1745 | ------------- | ||
1746 | |||
1747 | Do multicast routing. The kernel needs to be compiled with CONFIG_MROUTE and a | ||
1748 | multicast routing daemon is required. | ||
1749 | |||
1750 | proxy_arp | ||
1751 | --------- | ||
1752 | |||
1753 | Does (1) or does not (0) perform proxy ARP. | ||
1754 | |||
1755 | rp_filter | ||
1756 | --------- | ||
1757 | |||
1758 | Integer value determining whether source validation should be performed. | ||
1759 | 1 means yes, 0 means no. Disabled by default, but protection against | ||
1760 | local/broadcast address spoofing is always enabled. | ||
1761 | |||
1762 | If you set this to 1 on a router that is the only connection for a network to | ||
1763 | the net, it will prevent spoofing attacks against your internal networks | ||
1764 | (external addresses can still be spoofed), without the need for additional | ||
1765 | firewall rules. | ||
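     | |||
     | A minimal example of switching source validation on for every interface at | ||
     | once through the all subdirectory (illustrative command): | ||
     | |||
     |   $ echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter | ||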
1766 | |||
1767 | secure_redirects | ||
1768 | ---------------- | ||
1769 | |||
1770 | Accept ICMP redirect messages only from gateways listed in the default | ||
1771 | gateway list. Enabled by default. | ||
1772 | |||
1773 | shared_media | ||
1774 | ------------ | ||
1775 | |||
1776 | If it is not set, the kernel does not assume that different subnets on this | ||
1777 | device can communicate directly. Default setting is 'yes'. | ||
1778 | |||
1779 | send_redirects | ||
1780 | -------------- | ||
1781 | |||
1782 | Determines whether to send ICMP redirects to other hosts. | ||
1783 | |||
1784 | Routing settings | ||
1785 | ---------------- | ||
1786 | |||
1787 | The directory /proc/sys/net/ipv4/route contains several files to control | ||
1788 | routing issues. | ||
1789 | |||
1790 | error_burst and error_cost | ||
1791 | -------------------------- | ||
1792 | |||
1793 | These parameters are used to limit how many ICMP destination unreachable | ||
1794 | messages are sent from the host in question. ICMP destination unreachable | ||
1795 | messages are sent when we cannot reach the next hop while trying to transmit | ||
1796 | a packet. The kernel will also print some error messages to its logs if | ||
1797 | someone is ignoring our ICMP redirects. The higher the error_cost factor, | ||
1798 | the fewer destination unreachable and error messages will be let through. | ||
1799 | Error_burst controls when destination unreachable and error messages will be | ||
1800 | dropped. The default settings limit warning messages to five every second. | ||
1801 | |||
1802 | flush | ||
1803 | ----- | ||
1804 | |||
1805 | Writing to this file results in a flush of the routing cache. | ||
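     | |||
     | For example, the routing cache can be flushed by hand (illustrative command): | ||
     | |||
     |   $ echo 1 > /proc/sys/net/ipv4/route/flush | ||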
1806 | |||
1807 | gc_elasticity, gc_interval, gc_min_interval_ms, gc_timeout, gc_thresh | ||
1808 | --------------------------------------------------------------------- | ||
1809 | |||
1810 | Values to control the frequency and behavior of the garbage collection | ||
1811 | algorithm for the routing cache. gc_min_interval is deprecated and replaced | ||
1812 | by gc_min_interval_ms. | ||
1813 | |||
1814 | |||
1815 | max_size | ||
1816 | -------- | ||
1817 | |||
1818 | Maximum size of the routing cache. Old entries will be purged once the cache | ||
1819 | has reached this size. | ||
1820 | |||
1821 | redirect_load, redirect_number | ||
1822 | ------------------------------ | ||
1823 | |||
1824 | Factors which determine if more ICMP redirects should be sent to a specific | ||
1825 | host. No redirects will be sent once the load limit or the maximum number of | ||
1826 | redirects has been reached. | ||
1827 | |||
1828 | redirect_silence | ||
1829 | ---------------- | ||
1830 | |||
1831 | Timeout for redirects. After this period redirects will be sent again, even | ||
1832 | if sending has been stopped because the load or number limit was reached. | ||
1833 | |||
1834 | Network Neighbor handling | ||
1835 | ------------------------- | ||
1836 | |||
1837 | Settings about how to handle connections with direct neighbors (nodes attached | ||
1838 | to the same link) can be found in the directory /proc/sys/net/ipv4/neigh. | ||
1839 | |||
1840 | As in the conf directory, there is a default subdirectory which holds the | ||
1841 | default values, and one directory for each interface. The contents of the | ||
1842 | directories are identical, with the single exception that the default | ||
1843 | settings contain additional options to set garbage collection parameters. | ||
1844 | |||
1845 | In the interface directories you'll find the following entries: | ||
1846 | |||
1847 | base_reachable_time, base_reachable_time_ms | ||
1848 | ------------------------------------------- | ||
1849 | |||
1850 | A base value used for computing the random reachable time value as specified | ||
1851 | in RFC2461. | ||
1852 | |||
1853 | Expression of base_reachable_time, which is deprecated, is in seconds. | ||
1854 | Expression of base_reachable_time_ms is in milliseconds. | ||
1855 | |||
1856 | retrans_time, retrans_time_ms | ||
1857 | ----------------------------- | ||
1858 | |||
1859 | The time between retransmitted Neighbor Solicitation messages. | ||
1860 | Used for address resolution and to determine if a neighbor is | ||
1861 | unreachable. | ||
1862 | |||
1863 | Expression of retrans_time, which is deprecated, is in 1/100 seconds (for | ||
1864 | IPv4) or in jiffies (for IPv6). | ||
1865 | Expression of retrans_time_ms is in milliseconds. | ||
1866 | |||
1867 | unres_qlen | ||
1868 | ---------- | ||
1869 | |||
1870 | Maximum queue length for a pending ARP request - the number of packets which | ||
1871 | are accepted from other layers while the ARP address is still being resolved. | ||
1872 | |||
1873 | anycast_delay | ||
1874 | ------------- | ||
1875 | |||
1876 | Maximum for random delay of answers to neighbor solicitation messages in | ||
1877 | jiffies (1/100 sec). Not yet implemented (Linux does not have anycast support | ||
1878 | yet). | ||
1879 | |||
1880 | ucast_solicit | ||
1881 | ------------- | ||
1882 | |||
1883 | Maximum number of retries for unicast solicitation. | ||
1884 | |||
1885 | mcast_solicit | ||
1886 | ------------- | ||
1887 | |||
1888 | Maximum number of retries for multicast solicitation. | ||
1889 | |||
1890 | delay_first_probe_time | ||
1891 | ---------------------- | ||
1892 | |||
1893 | Delay for the first time probe if the neighbor is reachable. (see | ||
1894 | gc_stale_time) | ||
1895 | |||
1896 | locktime | ||
1897 | -------- | ||
1898 | |||
1899 | An ARP/neighbor entry is only replaced with a new one if the old one is at | ||
1900 | least locktime old. This prevents ARP cache thrashing. | ||
1901 | |||
1902 | proxy_delay | ||
1903 | ----------- | ||
1904 | |||
1905 | Maximum time (the actual delay is random in [0..proxy_delay]) before | ||
1906 | answering an ARP request for which we have a proxy ARP entry. In some cases, | ||
1907 | this is used to prevent network flooding. | ||
1908 | |||
1909 | proxy_qlen | ||
1910 | ---------- | ||
1911 | |||
1912 | Maximum queue length of the delayed proxy ARP timer (see proxy_delay). | ||
1913 | |||
1914 | app_solicit | ||
1915 | ----------- | ||
1916 | |||
1917 | Determines the number of requests to send to the user level ARP daemon. Use 0 | ||
1918 | to turn off. | ||
1919 | |||
1920 | gc_stale_time | ||
1921 | ------------- | ||
1922 | |||
1923 | Determines how often to check for stale ARP entries. After an ARP entry is | ||
1924 | stale it will be resolved again (which is useful when an IP address migrates | ||
1925 | to another machine). When ucast_solicit is greater than 0 it first tries to | ||
1926 | send an ARP packet directly to the known host. When that fails and | ||
1927 | mcast_solicit is greater than 0, an ARP request is broadcast. | ||
1928 | |||
1929 | 2.9 Appletalk | ||
1930 | ------------- | ||
1931 | |||
1932 | The /proc/sys/net/appletalk directory holds the Appletalk configuration data | ||
1933 | when Appletalk is loaded. The configurable parameters are: | ||
1934 | |||
1935 | aarp-expiry-time | ||
1936 | ---------------- | ||
1937 | |||
1938 | The amount of time we keep an ARP entry before expiring it. Used to age out | ||
1939 | old hosts. | ||
1940 | |||
1941 | aarp-resolve-time | ||
1942 | ----------------- | ||
1943 | |||
1944 | The amount of time we will spend trying to resolve an Appletalk address. | ||
1945 | |||
1946 | aarp-retransmit-limit | ||
1947 | --------------------- | ||
1948 | |||
1949 | The number of times we will retransmit a query before giving up. | ||
1950 | |||
1951 | aarp-tick-time | ||
1952 | -------------- | ||
1953 | |||
1954 | Controls the rate at which expires are checked. | ||
1955 | |||
1956 | The directory /proc/net/appletalk holds the list of active Appletalk sockets | ||
1957 | on a machine. | ||
1958 | |||
1959 | The fields indicate the DDP type, the local address (in network:node format), | ||
1960 | the remote address, the size of the transmit pending queue, the size of the | ||
1961 | received queue (bytes waiting for applications to read), the state and the | ||
1962 | uid owning the socket. | ||
1963 | |||
1964 | /proc/net/atalk_iface lists all the interfaces configured for Appletalk. It | ||
1965 | shows the name of the interface, its Appletalk address, the network range on | ||
1966 | that address (or network number for phase 1 networks), and the status of the | ||
1967 | interface. | ||
1968 | |||
1969 | /proc/net/atalk_route lists each known network route. It lists the target | ||
1970 | (network) that the route leads to, the router (may be directly connected), the | ||
1971 | route flags, and the device the route is using. | ||
1972 | |||
1973 | 2.10 IPX | ||
1974 | -------- | ||
1975 | |||
1976 | The IPX protocol has no tunable values in /proc/sys/net. | ||
1977 | |||
1978 | The IPX protocol does, however, provide /proc/net/ipx. This lists each IPX | ||
1979 | socket giving the local and remote addresses in Novell format (that is | ||
1980 | network:node:port). In accordance with the strange Novell tradition, | ||
1981 | everything but the port is in hex. Not_Connected is displayed for sockets that | ||
1982 | are not tied to a specific remote address. The Tx and Rx queue sizes indicate | ||
1983 | the number of bytes pending for transmission and reception. The state | ||
1984 | indicates the state the socket is in and the uid is the owning uid of the | ||
1985 | socket. | ||
1986 | |||
1987 | The /proc/net/ipx_interface file lists all IPX interfaces. For each interface | ||
1988 | it gives the network number, the node number, and indicates if the network is | ||
1989 | the primary network. It also indicates which device it is bound to (or | ||
1990 | Internal for internal networks) and the Frame Type if appropriate. Linux | ||
1991 | supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for | ||
1992 | IPX. | ||
1993 | |||
1994 | The /proc/net/ipx_route table holds a list of IPX routes. For each route it | ||
1995 | gives the destination network, the router node (or Directly) and the network | ||
1996 | address of the router (or Connected) for internal networks. | ||
1997 | |||
1998 | 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem | ||
1999 | ---------------------------------------------------------- | ||
2000 | |||
2001 | The "mqueue" filesystem provides the necessary kernel features to enable the | ||
2002 | creation of a user space library that implements the POSIX message queues | ||
2003 | API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System | ||
2004 | Interfaces specification.) | ||
2005 | |||
2006 | The "mqueue" filesystem contains values for determining/setting the amount of | ||
2007 | resources used by the file system. | ||
2008 | |||
2009 | /proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the | ||
2010 | maximum number of message queues allowed on the system. | ||
2011 | |||
2012 | /proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the | ||
2013 | maximum number of messages in a queue. In fact it is the limiting value for | ||
2014 | another (user) limit which is set in the mq_open invocation. This attribute | ||
2015 | of a queue must be less than or equal to msg_max. | ||
2016 | |||
2017 | /proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the | ||
2018 | maximum message size value (it is every message queue's attribute set during | ||
2019 | its creation). | ||
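     | |||
     | A short sketch of inspecting and raising these limits (the numbers are only | ||
     | examples, not recommended values): | ||
     | |||
     |   $ cat /proc/sys/fs/mqueue/queues_max | ||
     |   $ echo 512 > /proc/sys/fs/mqueue/queues_max | ||
     |   $ echo 64 > /proc/sys/fs/mqueue/msg_max | ||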
2020 | 1002 | ||
2021 | 2.12 /proc/<pid>/oom_adj - Adjust the oom-killer score | 1003 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score |
2022 | ------------------------------------------------------ | 1004 | ------------------------------------------------------ |
2023 | 1005 | ||
2024 | This file can be used to adjust the score used to select which processes | 1006 | This file can be used to adjust the score used to select which processes |
@@ -2055,25 +1037,15 @@ The task with the highest badness score is then selected and its children | |||
2055 | are killed; the process itself will be killed in an OOM situation when it | 1037 | are killed; the process itself will be killed in an OOM situation when it |
2056 | does not have children or some of them have disabled oom as described above. | 1038 | does not have children or some of them have disabled oom as described above. |
2057 | 1039 | ||
2058 | 2.13 /proc/<pid>/oom_score - Display current oom-killer score | 1040 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score |
2059 | ------------------------------------------------------------- | 1041 | ------------------------------------------------------------- |
2060 | 1042 | ||
2061 | ------------------------------------------------------------------------------ | ||
2062 | This file can be used to check the current score used by the oom-killer for | 1043 | This file can be used to check the current score used by the oom-killer for |
2063 | any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which | 1044 | any given <pid>. Use it together with /proc/<pid>/oom_adj to tune which |
2064 | process should be killed in an out-of-memory situation. | 1045 | process should be killed in an out-of-memory situation. |
2065 | 1046 | ||
2066 | ------------------------------------------------------------------------------ | ||
2067 | Summary | ||
2068 | ------------------------------------------------------------------------------ | ||
2069 | Certain aspects of kernel behavior can be modified at runtime, without the | ||
2070 | need to recompile the kernel, or even to reboot the system. The files in the | ||
2071 | /proc/sys tree can not only be read, but also modified. You can use the echo | ||
2072 | command to write values into these files, thereby changing the default | ||
2073 | settings of the kernel. | ||
2074 | ------------------------------------------------------------------------------ | ||
2075 | 1047 | ||
2076 | 2.14 /proc/<pid>/io - Display the IO accounting fields | 1048 | 3.3 /proc/<pid>/io - Display the IO accounting fields |
2077 | ------------------------------------------------------- | 1049 | ------------------------------------------------------- |
2078 | 1050 | ||
2079 | This file contains IO statistics for each running process | 1051 | This file contains IO statistics for each running process |
@@ -2175,7 +1147,7 @@ those 64-bit counters, process A could see an intermediate result. | |||
2175 | More information about this can be found within the taskstats documentation in | 1147 | More information about this can be found within the taskstats documentation in |
2176 | Documentation/accounting. | 1148 | Documentation/accounting. |
2177 | 1149 | ||
2178 | 2.15 /proc/<pid>/coredump_filter - Core dump filtering settings | 1150 | 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings |
2179 | --------------------------------------------------------------- | 1151 | --------------------------------------------------------------- |
2180 | When a process is dumped, all anonymous memory is written to a core file as | 1152 | When a process is dumped, all anonymous memory is written to a core file as |
2181 | long as the size of the core file isn't limited. But sometimes we don't want | 1153 | long as the size of the core file isn't limited. But sometimes we don't want |
@@ -2219,7 +1191,7 @@ For example: | |||
2219 | $ echo 0x7 > /proc/self/coredump_filter | 1191 | $ echo 0x7 > /proc/self/coredump_filter |
2220 | $ ./some_program | 1192 | $ ./some_program |
2221 | 1193 | ||
2222 | 2.16 /proc/<pid>/mountinfo - Information about mounts | 1194 | 3.5 /proc/<pid>/mountinfo - Information about mounts |
2223 | -------------------------------------------------------- | 1195 | -------------------------------------------------------- |
2224 | 1196 | ||
2225 | This file contains lines of the form: | 1197 | This file contains lines of the form: |
@@ -2256,30 +1228,3 @@ For more information on mount propagation see: | |||
2256 | 1228 | ||
2257 | Documentation/filesystems/sharedsubtree.txt | 1229 | Documentation/filesystems/sharedsubtree.txt |
2258 | 1230 | ||
2259 | 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface | ||
2260 | -------------------------------------------------------- | ||
2261 | |||
2262 | This directory contains configuration options for the epoll(7) interface. | ||
2263 | |||
2264 | max_user_instances | ||
2265 | ------------------ | ||
2266 | |||
2267 | This is the maximum number of epoll file descriptors that a single user can | ||
2268 | have open at a given time. The default value is 128, and should be enough | ||
2269 | for normal users. | ||
2270 | |||
2271 | max_user_watches | ||
2272 | ---------------- | ||
2273 | |||
2274 | Every epoll file descriptor can store a number of files to be monitored | ||
2275 | for event readiness. Each one of these monitored files constitutes a "watch". | ||
2276 | This configuration option sets the maximum number of "watches" that are | ||
2277 | allowed for each user. | ||
2278 | Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes | ||
2279 | on a 64bit one. | ||
2280 | The current default value for max_user_watches is 1/32 of the available | ||
2281 | low memory, divided by the "watch" cost in bytes. | ||
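     | |||
     | For example, the current limits can be read and max_user_watches raised with | ||
     | commands like these (the value is only an example): | ||
     | |||
     |   $ cat /proc/sys/fs/epoll/max_user_instances | ||
     |   $ cat /proc/sys/fs/epoll/max_user_watches | ||
     |   $ echo 1000000 > /proc/sys/fs/epoll/max_user_watches | ||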
2282 | |||
2283 | |||
2284 | ------------------------------------------------------------------------------ | ||
2285 | |||
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt index 3e79e4a7a392..b324c033035a 100644 --- a/Documentation/filesystems/squashfs.txt +++ b/Documentation/filesystems/squashfs.txt | |||
@@ -22,7 +22,7 @@ Squashfs filesystem features versus Cramfs: | |||
22 | 22 | ||
23 | Squashfs Cramfs | 23 | Squashfs Cramfs |
24 | 24 | ||
25 | Max filesystem size: 2^64 16 MiB | 25 | Max filesystem size: 2^64 256 MiB |
26 | Max file size: ~ 2 TiB 16 MiB | 26 | Max file size: ~ 2 TiB 16 MiB |
27 | Max files: unlimited unlimited | 27 | Max files: unlimited unlimited |
28 | Max directories: unlimited unlimited | 28 | Max directories: unlimited unlimited |
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 9f8740ca3f3b..26e4b8bc53ee 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt | |||
@@ -12,6 +12,7 @@ that support it. For example, a given bus might look like this: | |||
12 | | |-- enable | 12 | | |-- enable |
13 | | |-- irq | 13 | | |-- irq |
14 | | |-- local_cpus | 14 | | |-- local_cpus |
15 | | |-- remove | ||
15 | | |-- resource | 16 | | |-- resource |
16 | | |-- resource0 | 17 | | |-- resource0 |
17 | | |-- resource1 | 18 | | |-- resource1 |
@@ -36,6 +37,7 @@ files, each with their own function. | |||
36 | enable Whether the device is enabled (ascii, rw) | 37 | enable Whether the device is enabled (ascii, rw) |
37 | irq IRQ number (ascii, ro) | 38 | irq IRQ number (ascii, ro) |
38 | local_cpus nearby CPU mask (cpumask, ro) | 39 | local_cpus nearby CPU mask (cpumask, ro) |
40 | remove remove device from kernel's list (ascii, wo) | ||
39 | resource PCI resource host addresses (ascii, ro) | 41 | resource PCI resource host addresses (ascii, ro) |
40 | resource0..N PCI resource N, if present (binary, mmap) | 42 | resource0..N PCI resource N, if present (binary, mmap) |
41 | resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) | 43 | resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) |
@@ -46,6 +48,7 @@ files, each with their own function. | |||
46 | 48 | ||
47 | ro - read only file | 49 | ro - read only file |
48 | rw - file is readable and writable | 50 | rw - file is readable and writable |
51 | wo - write only file | ||
49 | mmap - file is mmapable | 52 | mmap - file is mmapable |
50 | ascii - file contains ascii text | 53 | ascii - file contains ascii text |
51 | binary - file contains binary data | 54 | binary - file contains binary data |
@@ -73,6 +76,13 @@ that the device must be enabled for a rom read to return data succesfully. | |||
73 | In the event a driver is not bound to the device, it can be enabled using the | 76 | In the event a driver is not bound to the device, it can be enabled using the |
74 | 'enable' file, documented above. | 77 | 'enable' file, documented above. |
75 | 78 | ||
79 | The 'remove' file is used to remove the PCI device, by writing a non-zero | ||
80 | integer to the file. This does not involve any kind of hot-plug functionality, | ||
81 | e.g. powering off the device. The device is removed from the kernel's list of | ||
82 | PCI devices, the sysfs directory for it is removed, and the device will be | ||
83 | removed from any drivers attached to it. Removal of PCI root buses is | ||
84 | disallowed. | ||
85 | |||
76 | Accessing legacy resources through sysfs | 86 | Accessing legacy resources through sysfs |
77 | ---------------------------------------- | 87 | ---------------------------------------- |
78 | 88 | ||
diff --git a/Documentation/filesystems/udf.txt b/Documentation/filesystems/udf.txt index fde829a756e6..902b95d0ee51 100644 --- a/Documentation/filesystems/udf.txt +++ b/Documentation/filesystems/udf.txt | |||
@@ -24,6 +24,8 @@ The following mount options are supported: | |||
24 | 24 | ||
25 | gid= Set the default group. | 25 | gid= Set the default group. |
26 | umask= Set the default umask. | 26 | umask= Set the default umask. |
27 | mode= Set the default file permissions. | ||
28 | dmode= Set the default directory permissions. | ||
27 | uid= Set the default user. | 29 | uid= Set the default user. |
28 | bs= Set the block size. | 30 | bs= Set the block size. |
29 | unhide Show otherwise hidden files. | 31 | unhide Show otherwise hidden files. |
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index deeeed0faa8f..f49eecf2e573 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -277,8 +277,7 @@ or bottom half). | |||
277 | unfreeze_fs: called when VFS is unlocking a filesystem and making it writable | 277 | unfreeze_fs: called when VFS is unlocking a filesystem and making it writable |
278 | again. | 278 | again. |
279 | 279 | ||
280 | statfs: called when the VFS needs to get filesystem statistics. This | 280 | statfs: called when the VFS needs to get filesystem statistics. |
281 | is called with the kernel lock held | ||
282 | 281 | ||
283 | remount_fs: called when the filesystem is remounted. This is called | 282 | remount_fs: called when the filesystem is remounted. This is called |
284 | with the kernel lock held | 283 | with the kernel lock held |