diff options
author | Jeff Layton <jlayton@redhat.com> | 2017-07-24 06:22:16 -0400 |
---|---|---|
committer | Jeff Layton <jlayton@redhat.com> | 2017-07-29 09:01:02 -0400 |
commit | 80aafd50b6a4fa6b6bba4b451b553d5d221f59ff (patch) | |
tree | 573334b6917db9e00ecf9602b2f065cda75d70b8 | |
parent | 3acdfd280fe7d807237f2cb7a09d6f8f7f1b484f (diff) |
Documentation: add some docs for errseq_t
...and fix up a few comments in the code.
Signed-off-by: Jeff Layton <jlayton@redhat.com>
-rw-r--r-- | Documentation/errseq.rst | 149 | ||||
-rw-r--r-- | include/linux/errseq.h | 5 | ||||
-rw-r--r-- | include/linux/fs.h | 2 |
3 files changed, 152 insertions, 4 deletions
diff --git a/Documentation/errseq.rst b/Documentation/errseq.rst new file mode 100644 index 000000000000..4c29bd5afbc5 --- /dev/null +++ b/Documentation/errseq.rst | |||
@@ -0,0 +1,149 @@ | |||
1 | The errseq_t datatype | ||
2 | ===================== | ||
3 | An errseq_t is a way of recording errors in one place, and allowing any | ||
4 | number of "subscribers" to tell whether it has changed since a previous | ||
5 | point where it was sampled. | ||
6 | |||
7 | The initial use case for this is tracking errors for file | ||
8 | synchronization syscalls (fsync, fdatasync, msync and sync_file_range), | ||
9 | but it may be usable in other situations. | ||
10 | |||
11 | It's implemented as an unsigned 32-bit value. The low order bits are | ||
12 | designated to hold an error code (between 1 and MAX_ERRNO). The upper bits | ||
13 | are used as a counter. This is done with atomics instead of locking so that | ||
14 | these functions can be called from any context. | ||
15 | |||
16 | Note that there is a risk of collisions if new errors are being recorded | ||
17 | frequently, since we have so few bits to use as a counter. | ||
18 | |||
19 | To mitigate this, the bit between the error value and counter is used as | ||
20 | a flag to tell whether the value has been sampled since a new value was | ||
21 | recorded. That allows us to avoid bumping the counter if no one has | ||
22 | sampled it since the last time an error was recorded. | ||
23 | |||
24 | Thus we end up with a value that looks something like this:: | ||
25 | |||
26 | bit: 31..13 12 11..0 | ||
27 | +-----------------+----+----------------+ | ||
28 | | counter | SF | errno | | ||
29 | +-----------------+----+----------------+ | ||
30 | |||
31 | The general idea is for "watchers" to sample an errseq_t value and keep | ||
32 | it as a running cursor. That value can later be used to tell whether | ||
33 | any new errors have occurred since that sampling was done, and atomically | ||
34 | record the state at the time that it was checked. This allows us to | ||
35 | record errors in one place, and then have a number of "watchers" that | ||
36 | can tell whether the value has changed since they last checked it. | ||
37 | |||
38 | A new errseq_t should always be zeroed out. An errseq_t value of all zeroes | ||
39 | is the special (but common) case where there has never been an error. An all | ||
40 | zero value thus serves as the "epoch" if one wishes to know whether there | ||
41 | has ever been an error set since it was first initialized. | ||
42 | |||
43 | API usage | ||
44 | ========= | ||
45 | Let me tell you a story about a worker drone. Now, he's a good worker | ||
46 | overall, but the company is a little...management heavy. He has to | ||
47 | report to 77 supervisors today, and tomorrow the "big boss" is coming in | ||
48 | from out of town and he's sure to test the poor fellow too. | ||
49 | |||
50 | They're all handing him work to do -- so much he can't keep track of who | ||
51 | handed him what, but that's not really a big problem. The supervisors | ||
52 | just want to know when he's finished all of the work they've handed him so | ||
53 | far and whether he made any mistakes since they last asked. | ||
54 | |||
55 | He might have made the mistake on work they didn't actually hand him, | ||
56 | but he can't keep track of things at that level of detail; all he can | ||
57 | remember is the most recent mistake that he made. | ||
58 | |||
59 | Here's our worker_drone representation:: | ||
60 | |||
61 | struct worker_drone { | ||
62 | errseq_t wd_err; /* for recording errors */ | ||
63 | }; | ||
64 | |||
65 | Every day, the worker_drone starts out with a blank slate:: | ||
66 | |||
67 | struct worker_drone wd; | ||
68 | |||
69 | wd.wd_err = (errseq_t)0; | ||
70 | |||
71 | The supervisors come in and get an initial read for the day. They | ||
72 | don't care about anything that happened before their watch begins:: | ||
73 | |||
74 | struct supervisor { | ||
75 | errseq_t s_wd_err; /* private "cursor" for wd_err */ | ||
76 | spinlock_t s_wd_err_lock; /* protects s_wd_err */ | ||
77 | }; | ||
78 | |||
79 | struct supervisor su; | ||
80 | |||
81 | su.s_wd_err = errseq_sample(&wd.wd_err); | ||
82 | spin_lock_init(&su.s_wd_err_lock); | ||
83 | |||
84 | Now they start handing him tasks to do. Every few minutes they ask him to | ||
85 | finish up all of the work they've handed him so far. Then they ask him | ||
86 | whether he made any mistakes on any of it:: | ||
87 | |||
88 | spin_lock(&su.s_wd_err_lock); | ||
89 | err = errseq_check_and_advance(&wd.wd_err, &su.s_wd_err); | ||
90 | spin_unlock(&su.s_wd_err_lock); | ||
91 | |||
92 | Up to this point, that just keeps returning 0. | ||
93 | |||
94 | Now, the owners of this company are quite miserly and have given him | ||
95 | substandard equipment with which to do his job. Occasionally it | ||
96 | glitches and he makes a mistake. He sighs a heavy sigh, and marks it | ||
97 | down:: | ||
98 | |||
99 | errseq_set(&wd.wd_err, -EIO); | ||
100 | |||
101 | ...and then gets back to work. The supervisors eventually poll again | ||
102 | and they each get the error when they next check. Subsequent calls will | ||
103 | return 0, until another error is recorded, at which point it's reported | ||
104 | to each of them once. | ||
105 | |||
106 | Note that the supervisors can't tell how many mistakes he made, only | ||
107 | whether one was made since they last checked, and the latest value | ||
108 | recorded. | ||
109 | |||
110 | Occasionally the big boss comes in for a spot check and asks the worker | ||
111 | to do a one-off job for him. He's not really watching the worker | ||
112 | full-time like the supervisors, but he does need to know whether a | ||
113 | mistake occurred while his job was processing. | ||
114 | |||
115 | He can just sample the current errseq_t in the worker, and then use that | ||
116 | to tell whether an error has occurred later:: | ||
117 | |||
118 | errseq_t since = errseq_sample(&wd.wd_err); | ||
119 | /* submit some work and wait for it to complete */ | ||
120 | err = errseq_check(&wd.wd_err, since); | ||
121 | |||
122 | Since he's just going to discard "since" after that point, he doesn't | ||
123 | need to advance it here. He also doesn't need any locking since it's | ||
124 | not usable by anyone else. | ||
125 | |||
126 | Serializing errseq_t cursor updates | ||
127 | =================================== | ||
128 | Note that the errseq_t API does not protect the errseq_t cursor during a | ||
129 | check_and_advance operation. Only the canonical error code is handled | ||
130 | atomically. In a situation where more than one task might be using the | ||
131 | same errseq_t cursor at the same time, it's important to serialize | ||
132 | updates to that cursor. | ||
133 | |||
134 | If that's not done, then it's possible for the cursor to go backward | ||
135 | in which case the same error could be reported more than once. | ||
136 | |||
137 | Because of this, it's often advantageous to first do an errseq_check to | ||
138 | see if anything has changed, and only later do an | ||
139 | errseq_check_and_advance after taking the lock. e.g.:: | ||
140 | |||
141 | if (errseq_check(&wd.wd_err, READ_ONCE(su.s_wd_err))) { | ||
142 | /* su.s_wd_err is protected by s_wd_err_lock */ | ||
143 | spin_lock(&su.s_wd_err_lock); | ||
144 | err = errseq_check_and_advance(&wd.wd_err, &su.s_wd_err); | ||
145 | spin_unlock(&su.s_wd_err_lock); | ||
146 | } | ||
147 | |||
148 | That avoids the spinlock in the common case where nothing has changed | ||
149 | since the last time it was checked. | ||
diff --git a/include/linux/errseq.h b/include/linux/errseq.h index 784f0860527b..f746bd8fe4d0 100644 --- a/include/linux/errseq.h +++ b/include/linux/errseq.h | |||
@@ -1,8 +1,9 @@ | |||
1 | /* | ||
2 | * See Documentation/errseq.rst and lib/errseq.c | ||
3 | */ | ||
1 | #ifndef _LINUX_ERRSEQ_H | 4 | #ifndef _LINUX_ERRSEQ_H |
2 | #define _LINUX_ERRSEQ_H | 5 | #define _LINUX_ERRSEQ_H |
3 | 6 | ||
4 | /* See lib/errseq.c for more info */ | ||
5 | |||
6 | typedef u32 errseq_t; | 7 | typedef u32 errseq_t; |
7 | 8 | ||
8 | errseq_t errseq_set(errseq_t *eseq, int err); | 9 | errseq_t errseq_set(errseq_t *eseq, int err); |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b5d6816542b..21e7df1ad613 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -2571,8 +2571,6 @@ extern int __must_check file_write_and_wait_range(struct file *file, | |||
2571 | * When a writeback error occurs, most filesystems will want to call | 2571 | * When a writeback error occurs, most filesystems will want to call |
2572 | * filemap_set_wb_err to record the error in the mapping so that it will be | 2572 | * filemap_set_wb_err to record the error in the mapping so that it will be |
2573 | * automatically reported whenever fsync is called on the file. | 2573 | * automatically reported whenever fsync is called on the file. |
2574 | * | ||
2575 | * FIXME: mention FS_* flag here? | ||
2576 | */ | 2574 | */ |
2577 | static inline void filemap_set_wb_err(struct address_space *mapping, int err) | 2575 | static inline void filemap_set_wb_err(struct address_space *mapping, int err) |
2578 | { | 2576 | { |