diff options
Diffstat (limited to 'fs/xfs/libxfs/xfs_defer.c')
-rw-r--r-- | fs/xfs/libxfs/xfs_defer.c | 463 |
1 files changed, 463 insertions, 0 deletions
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c new file mode 100644 index 000000000000..054a2032fdb3 --- /dev/null +++ b/fs/xfs/libxfs/xfs_defer.c | |||
@@ -0,0 +1,463 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2016 Oracle. All Rights Reserved. | ||
3 | * | ||
4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version 2 | ||
9 | * of the License, or (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it would be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write the Free Software Foundation, | ||
18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | ||
19 | */ | ||
20 | #include "xfs.h" | ||
21 | #include "xfs_fs.h" | ||
22 | #include "xfs_shared.h" | ||
23 | #include "xfs_format.h" | ||
24 | #include "xfs_log_format.h" | ||
25 | #include "xfs_trans_resv.h" | ||
26 | #include "xfs_bit.h" | ||
27 | #include "xfs_sb.h" | ||
28 | #include "xfs_mount.h" | ||
29 | #include "xfs_defer.h" | ||
30 | #include "xfs_trans.h" | ||
31 | #include "xfs_trace.h" | ||
32 | |||
33 | /* | ||
34 | * Deferred Operations in XFS | ||
35 | * | ||
36 | * Due to the way locking rules work in XFS, certain transactions (block | ||
37 | * mapping and unmapping, typically) have permanent reservations so that | ||
38 | * we can roll the transaction to adhere to AG locking order rules and | ||
39 | * to unlock buffers between metadata updates. Prior to rmap/reflink, | ||
40 | * the mapping code had a mechanism to perform these deferrals for | ||
41 | * extents that were going to be freed; this code makes that facility | ||
42 | * more generic. | ||
43 | * | ||
44 | * When adding the reverse mapping and reflink features, it became | ||
45 | * necessary to perform complex remapping multi-transactions to comply | ||
46 | * with AG locking order rules, and to be able to spread a single | ||
47 | * refcount update operation (an operation on an n-block extent can | ||
48 | * update as many as n records!) among multiple transactions. XFS can | ||
49 | * roll a transaction to facilitate this, but using this facility | ||
50 | * requires us to log "intent" items in case log recovery needs to | ||
51 | * redo the operation, and to log "done" items to indicate that redo | ||
52 | * is not necessary. | ||
53 | * | ||
54 | * Deferred work is tracked in xfs_defer_pending items. Each pending | ||
55 | * item tracks one type of deferred work. Incoming work items (which | ||
56 | * have not yet had an intent logged) are attached to a pending item | ||
57 | * on the dop_intake list, where they wait for the caller to finish | ||
58 | * the deferred operations. | ||
59 | * | ||
60 | * Finishing a set of deferred operations is an involved process. To | ||
61 | * start, we define "rolling a deferred-op transaction" as follows: | ||
62 | * | ||
63 | * > For each xfs_defer_pending item on the dop_intake list, | ||
64 | * - Sort the work items in AG order. XFS locking | ||
65 | * order rules require us to lock buffers in AG order. | ||
66 | * - Create a log intent item for that type. | ||
67 | * - Attach it to the pending item. | ||
68 | * - Move the pending item from the dop_intake list to the | ||
69 | * dop_pending list. | ||
70 | * > Roll the transaction. | ||
71 | * | ||
72 | * NOTE: To avoid exceeding the transaction reservation, we limit the | ||
73 | * number of items that we attach to a given xfs_defer_pending. | ||
74 | * | ||
75 | * The actual finishing process looks like this: | ||
76 | * | ||
77 | * > For each xfs_defer_pending in the dop_pending list, | ||
78 | * - Roll the deferred-op transaction as above. | ||
79 | * - Create a log done item for that type, and attach it to the | ||
80 | * log intent item. | ||
81 | * - For each work item attached to the log intent item, | ||
82 | * * Perform the described action. | ||
83 | * * Attach the work item to the log done item. | ||
84 | * | ||
85 | * The key here is that we must log an intent item for all pending | ||
86 | * work items every time we roll the transaction, and that we must log | ||
87 | * a done item as soon as the work is completed. With this mechanism | ||
88 | * we can perform complex remapping operations, chaining intent items | ||
89 | * as needed. | ||
90 | * | ||
91 | * This is an example of remapping the extent (E, E+B) into file X at | ||
92 | * offset A and dealing with the extent (C, C+B) already being mapped | ||
93 | * there: | ||
94 | * +-------------------------------------------------+ | ||
95 | * | Unmap file X startblock C offset A length B | t0 | ||
96 | * | Intent to reduce refcount for extent (C, B) | | ||
97 | * | Intent to remove rmap (X, C, A, B) | | ||
98 | * | Intent to free extent (D, 1) (bmbt block) | | ||
99 | * | Intent to map (X, A, B) at startblock E | | ||
100 | * +-------------------------------------------------+ | ||
101 | * | Map file X startblock E offset A length B | t1 | ||
102 | * | Done mapping (X, E, A, B) | | ||
103 | * | Intent to increase refcount for extent (E, B) | | ||
104 | * | Intent to add rmap (X, E, A, B) | | ||
105 | * +-------------------------------------------------+ | ||
106 | * | Reduce refcount for extent (C, B) | t2 | ||
107 | * | Done reducing refcount for extent (C, B) | | ||
108 | * | Increase refcount for extent (E, B) | | ||
109 | * | Done increasing refcount for extent (E, B) | | ||
110 | * | Intent to free extent (C, B) | | ||
111 | * | Intent to free extent (F, 1) (refcountbt block) | | ||
112 | * | Intent to remove rmap (F, 1, REFC) | | ||
113 | * +-------------------------------------------------+ | ||
114 | * | Remove rmap (X, C, A, B) | t3 | ||
115 | * | Done removing rmap (X, C, A, B) | | ||
116 | * | Add rmap (X, E, A, B) | | ||
117 | * | Done adding rmap (X, E, A, B) | | ||
118 | * | Remove rmap (F, 1, REFC) | | ||
119 | * | Done removing rmap (F, 1, REFC) | | ||
120 | * +-------------------------------------------------+ | ||
121 | * | Free extent (C, B) | t4 | ||
122 | * | Done freeing extent (C, B) | | ||
123 | * | Free extent (D, 1) | | ||
124 | * | Done freeing extent (D, 1) | | ||
125 | * | Free extent (F, 1) | | ||
126 | * | Done freeing extent (F, 1) | | ||
127 | * +-------------------------------------------------+ | ||
128 | * | ||
129 | * If we should crash before t2 commits, log recovery replays | ||
130 | * the following intent items: | ||
131 | * | ||
132 | * - Intent to reduce refcount for extent (C, B) | ||
133 | * - Intent to remove rmap (X, C, A, B) | ||
134 | * - Intent to free extent (D, 1) (bmbt block) | ||
135 | * - Intent to increase refcount for extent (E, B) | ||
136 | * - Intent to add rmap (X, E, A, B) | ||
137 | * | ||
138 | * In the process of recovering, it should also generate and take care | ||
139 | * of these intent items: | ||
140 | * | ||
141 | * - Intent to free extent (C, B) | ||
142 | * - Intent to free extent (F, 1) (refcountbt block) | ||
143 | * - Intent to remove rmap (F, 1, REFC) | ||
144 | */ | ||
145 | |||
146 | static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; | ||
147 | |||
148 | /* | ||
149 | * For each pending item in the intake list, log its intent item and the | ||
150 | * associated extents, then add the entire intake list to the end of | ||
151 | * the pending list. | ||
152 | */ | ||
153 | STATIC void | ||
154 | xfs_defer_intake_work( | ||
155 | struct xfs_trans *tp, | ||
156 | struct xfs_defer_ops *dop) | ||
157 | { | ||
158 | struct list_head *li; | ||
159 | struct xfs_defer_pending *dfp; | ||
160 | |||
161 | list_for_each_entry(dfp, &dop->dop_intake, dfp_list) { | ||
162 | trace_xfs_defer_intake_work(tp->t_mountp, dfp); | ||
163 | dfp->dfp_intent = dfp->dfp_type->create_intent(tp, | ||
164 | dfp->dfp_count); | ||
165 | list_sort(tp->t_mountp, &dfp->dfp_work, | ||
166 | dfp->dfp_type->diff_items); | ||
167 | list_for_each(li, &dfp->dfp_work) | ||
168 | dfp->dfp_type->log_item(tp, dfp->dfp_intent, li); | ||
169 | } | ||
170 | |||
171 | list_splice_tail_init(&dop->dop_intake, &dop->dop_pending); | ||
172 | } | ||
173 | |||
174 | /* Abort all the intents that were committed. */ | ||
175 | STATIC void | ||
176 | xfs_defer_trans_abort( | ||
177 | struct xfs_trans *tp, | ||
178 | struct xfs_defer_ops *dop, | ||
179 | int error) | ||
180 | { | ||
181 | struct xfs_defer_pending *dfp; | ||
182 | |||
183 | trace_xfs_defer_trans_abort(tp->t_mountp, dop); | ||
184 | /* | ||
185 | * If the transaction was committed, drop the intent reference | ||
186 | * since we're bailing out of here. The other reference is | ||
187 | * dropped when the intent hits the AIL. If the transaction | ||
188 | * was not committed, the intent is freed by the intent item | ||
189 | * unlock handler on abort. | ||
190 | */ | ||
191 | if (!dop->dop_committed) | ||
192 | return; | ||
193 | |||
194 | /* Abort intent items. */ | ||
195 | list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { | ||
196 | trace_xfs_defer_pending_abort(tp->t_mountp, dfp); | ||
197 | if (dfp->dfp_committed) | ||
198 | dfp->dfp_type->abort_intent(dfp->dfp_intent); | ||
199 | } | ||
200 | |||
201 | /* Shut down FS. */ | ||
202 | xfs_force_shutdown(tp->t_mountp, (error == -EFSCORRUPTED) ? | ||
203 | SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); | ||
204 | } | ||
205 | |||
206 | /* Roll a transaction so we can do some deferred op processing. */ | ||
207 | STATIC int | ||
208 | xfs_defer_trans_roll( | ||
209 | struct xfs_trans **tp, | ||
210 | struct xfs_defer_ops *dop, | ||
211 | struct xfs_inode *ip) | ||
212 | { | ||
213 | int i; | ||
214 | int error; | ||
215 | |||
216 | /* Log all the joined inodes except the one we passed in. */ | ||
217 | for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { | ||
218 | if (dop->dop_inodes[i] == ip) | ||
219 | continue; | ||
220 | xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); | ||
221 | } | ||
222 | |||
223 | trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); | ||
224 | |||
225 | /* Roll the transaction. */ | ||
226 | error = xfs_trans_roll(tp, ip); | ||
227 | if (error) { | ||
228 | trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error); | ||
229 | xfs_defer_trans_abort(*tp, dop, error); | ||
230 | return error; | ||
231 | } | ||
232 | dop->dop_committed = true; | ||
233 | |||
234 | /* Rejoin the joined inodes except the one we passed in. */ | ||
235 | for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { | ||
236 | if (dop->dop_inodes[i] == ip) | ||
237 | continue; | ||
238 | xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); | ||
239 | } | ||
240 | |||
241 | return error; | ||
242 | } | ||
243 | |||
244 | /* Do we have any work items to finish? */ | ||
245 | bool | ||
246 | xfs_defer_has_unfinished_work( | ||
247 | struct xfs_defer_ops *dop) | ||
248 | { | ||
249 | return !list_empty(&dop->dop_pending) || !list_empty(&dop->dop_intake); | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | * Add this inode to the deferred op. Each joined inode is relogged | ||
254 | * each time we roll the transaction, in addition to any inode passed | ||
255 | * to xfs_defer_finish(). | ||
256 | */ | ||
257 | int | ||
258 | xfs_defer_join( | ||
259 | struct xfs_defer_ops *dop, | ||
260 | struct xfs_inode *ip) | ||
261 | { | ||
262 | int i; | ||
263 | |||
264 | for (i = 0; i < XFS_DEFER_OPS_NR_INODES; i++) { | ||
265 | if (dop->dop_inodes[i] == ip) | ||
266 | return 0; | ||
267 | else if (dop->dop_inodes[i] == NULL) { | ||
268 | dop->dop_inodes[i] = ip; | ||
269 | return 0; | ||
270 | } | ||
271 | } | ||
272 | |||
273 | return -EFSCORRUPTED; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Finish all the pending work. This involves logging intent items for | ||
278 | * any work items that wandered in since the last transaction roll (if | ||
279 | * one has even happened), rolling the transaction, and finishing the | ||
280 | * work items in the first item on the logged-and-pending list. | ||
281 | * | ||
282 | * If an inode is provided, relog it to the new transaction. | ||
283 | */ | ||
284 | int | ||
285 | xfs_defer_finish( | ||
286 | struct xfs_trans **tp, | ||
287 | struct xfs_defer_ops *dop, | ||
288 | struct xfs_inode *ip) | ||
289 | { | ||
290 | struct xfs_defer_pending *dfp; | ||
291 | struct list_head *li; | ||
292 | struct list_head *n; | ||
293 | void *done_item = NULL; | ||
294 | void *state; | ||
295 | int error = 0; | ||
296 | void (*cleanup_fn)(struct xfs_trans *, void *, int); | ||
297 | |||
298 | ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); | ||
299 | |||
300 | trace_xfs_defer_finish((*tp)->t_mountp, dop); | ||
301 | |||
302 | /* Until we run out of pending work to finish... */ | ||
303 | while (xfs_defer_has_unfinished_work(dop)) { | ||
304 | /* Log intents for work items sitting in the intake. */ | ||
305 | xfs_defer_intake_work(*tp, dop); | ||
306 | |||
307 | /* Roll the transaction. */ | ||
308 | error = xfs_defer_trans_roll(tp, dop, ip); | ||
309 | if (error) | ||
310 | goto out; | ||
311 | |||
312 | /* Mark all pending intents as committed. */ | ||
313 | list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) { | ||
314 | if (dfp->dfp_committed) | ||
315 | break; | ||
316 | trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp); | ||
317 | dfp->dfp_committed = true; | ||
318 | } | ||
319 | |||
320 | /* Log an intent-done item for the first pending item. */ | ||
321 | dfp = list_first_entry(&dop->dop_pending, | ||
322 | struct xfs_defer_pending, dfp_list); | ||
323 | trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); | ||
324 | done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, | ||
325 | dfp->dfp_count); | ||
326 | cleanup_fn = dfp->dfp_type->finish_cleanup; | ||
327 | |||
328 | /* Finish the work items. */ | ||
329 | state = NULL; | ||
330 | list_for_each_safe(li, n, &dfp->dfp_work) { | ||
331 | list_del(li); | ||
332 | dfp->dfp_count--; | ||
333 | error = dfp->dfp_type->finish_item(*tp, dop, li, | ||
334 | done_item, &state); | ||
335 | if (error) { | ||
336 | /* | ||
337 | * Clean up after ourselves and jump out. | ||
338 | * xfs_defer_cancel will take care of freeing | ||
339 | * all these lists and stuff. | ||
340 | */ | ||
341 | if (cleanup_fn) | ||
342 | cleanup_fn(*tp, state, error); | ||
343 | xfs_defer_trans_abort(*tp, dop, error); | ||
344 | goto out; | ||
345 | } | ||
346 | } | ||
347 | /* Done with the dfp, free it. */ | ||
348 | list_del(&dfp->dfp_list); | ||
349 | kmem_free(dfp); | ||
350 | |||
351 | if (cleanup_fn) | ||
352 | cleanup_fn(*tp, state, error); | ||
353 | } | ||
354 | |||
355 | out: | ||
356 | if (error) | ||
357 | trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error); | ||
358 | else | ||
359 | trace_xfs_defer_finish_done((*tp)->t_mountp, dop); | ||
360 | return error; | ||
361 | } | ||
362 | |||
363 | /* | ||
364 | * Free up any items left in the list. | ||
365 | */ | ||
366 | void | ||
367 | xfs_defer_cancel( | ||
368 | struct xfs_defer_ops *dop) | ||
369 | { | ||
370 | struct xfs_defer_pending *dfp; | ||
371 | struct xfs_defer_pending *pli; | ||
372 | struct list_head *pwi; | ||
373 | struct list_head *n; | ||
374 | |||
375 | trace_xfs_defer_cancel(NULL, dop); | ||
376 | |||
377 | /* | ||
378 | * Free the pending items. Caller should already have arranged | ||
379 | * for the intent items to be released. | ||
380 | */ | ||
381 | list_for_each_entry_safe(dfp, pli, &dop->dop_intake, dfp_list) { | ||
382 | trace_xfs_defer_intake_cancel(NULL, dfp); | ||
383 | list_del(&dfp->dfp_list); | ||
384 | list_for_each_safe(pwi, n, &dfp->dfp_work) { | ||
385 | list_del(pwi); | ||
386 | dfp->dfp_count--; | ||
387 | dfp->dfp_type->cancel_item(pwi); | ||
388 | } | ||
389 | ASSERT(dfp->dfp_count == 0); | ||
390 | kmem_free(dfp); | ||
391 | } | ||
392 | list_for_each_entry_safe(dfp, pli, &dop->dop_pending, dfp_list) { | ||
393 | trace_xfs_defer_pending_cancel(NULL, dfp); | ||
394 | list_del(&dfp->dfp_list); | ||
395 | list_for_each_safe(pwi, n, &dfp->dfp_work) { | ||
396 | list_del(pwi); | ||
397 | dfp->dfp_count--; | ||
398 | dfp->dfp_type->cancel_item(pwi); | ||
399 | } | ||
400 | ASSERT(dfp->dfp_count == 0); | ||
401 | kmem_free(dfp); | ||
402 | } | ||
403 | } | ||
404 | |||
405 | /* Add an item for later deferred processing. */ | ||
406 | void | ||
407 | xfs_defer_add( | ||
408 | struct xfs_defer_ops *dop, | ||
409 | enum xfs_defer_ops_type type, | ||
410 | struct list_head *li) | ||
411 | { | ||
412 | struct xfs_defer_pending *dfp = NULL; | ||
413 | |||
414 | /* | ||
415 | * Add the item to a pending item at the end of the intake list. | ||
416 | * If the last pending item has the same type, reuse it. Else, | ||
417 | * create a new pending item at the end of the intake list. | ||
418 | */ | ||
419 | if (!list_empty(&dop->dop_intake)) { | ||
420 | dfp = list_last_entry(&dop->dop_intake, | ||
421 | struct xfs_defer_pending, dfp_list); | ||
422 | if (dfp->dfp_type->type != type || | ||
423 | (dfp->dfp_type->max_items && | ||
424 | dfp->dfp_count >= dfp->dfp_type->max_items)) | ||
425 | dfp = NULL; | ||
426 | } | ||
427 | if (!dfp) { | ||
428 | dfp = kmem_alloc(sizeof(struct xfs_defer_pending), | ||
429 | KM_SLEEP | KM_NOFS); | ||
430 | dfp->dfp_type = defer_op_types[type]; | ||
431 | dfp->dfp_committed = false; | ||
432 | dfp->dfp_intent = NULL; | ||
433 | dfp->dfp_count = 0; | ||
434 | INIT_LIST_HEAD(&dfp->dfp_work); | ||
435 | list_add_tail(&dfp->dfp_list, &dop->dop_intake); | ||
436 | } | ||
437 | |||
438 | list_add_tail(li, &dfp->dfp_work); | ||
439 | dfp->dfp_count++; | ||
440 | } | ||
441 | |||
442 | /* Initialize a deferred operation list. */ | ||
443 | void | ||
444 | xfs_defer_init_op_type( | ||
445 | const struct xfs_defer_op_type *type) | ||
446 | { | ||
447 | defer_op_types[type->type] = type; | ||
448 | } | ||
449 | |||
450 | /* Initialize a deferred operation. */ | ||
451 | void | ||
452 | xfs_defer_init( | ||
453 | struct xfs_defer_ops *dop, | ||
454 | xfs_fsblock_t *fbp) | ||
455 | { | ||
456 | dop->dop_committed = false; | ||
457 | dop->dop_low = false; | ||
458 | memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); | ||
459 | *fbp = NULLFSBLOCK; | ||
460 | INIT_LIST_HEAD(&dop->dop_intake); | ||
461 | INIT_LIST_HEAD(&dop->dop_pending); | ||
462 | trace_xfs_defer_init(NULL, dop); | ||
463 | } | ||