-rw-r--r--  Documentation/filesystems/xfs-delayed-logging-design.txt  816
-rw-r--r--  fs/xfs/Makefile                                             1
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c                                  9
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c                             1
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c                               12
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h                               83
-rw-r--r--  fs/xfs/quota/xfs_dquot.c                                    6
-rw-r--r--  fs/xfs/xfs_ag.h                                            24
-rw-r--r--  fs/xfs/xfs_alloc.c                                        357
-rw-r--r--  fs/xfs/xfs_alloc.h                                          7
-rw-r--r--  fs/xfs/xfs_alloc_btree.c                                    2
-rw-r--r--  fs/xfs/xfs_buf_item.c                                     166
-rw-r--r--  fs/xfs/xfs_buf_item.h                                      18
-rw-r--r--  fs/xfs/xfs_error.c                                          2
-rw-r--r--  fs/xfs/xfs_log.c                                          120
-rw-r--r--  fs/xfs/xfs_log.h                                           14
-rw-r--r--  fs/xfs/xfs_log_cil.c                                      725
-rw-r--r--  fs/xfs/xfs_log_priv.h                                     118
-rw-r--r--  fs/xfs/xfs_log_recover.c                                   46
-rw-r--r--  fs/xfs/xfs_log_recover.h                                    2
-rw-r--r--  fs/xfs/xfs_mount.h                                          1
-rw-r--r--  fs/xfs/xfs_trans.c                                        144
-rw-r--r--  fs/xfs/xfs_trans.h                                         44
-rw-r--r--  fs/xfs/xfs_trans_buf.c                                     46
-rw-r--r--  fs/xfs/xfs_trans_item.c                                   114
-rw-r--r--  fs/xfs/xfs_trans_priv.h                                    15
-rw-r--r--  fs/xfs/xfs_types.h                                          2

27 files changed, 2382 insertions(+), 513 deletions(-)
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.txt b/Documentation/filesystems/xfs-delayed-logging-design.txt
new file mode 100644
index 000000000000..d8119e9d2d60
--- /dev/null
+++ b/Documentation/filesystems/xfs-delayed-logging-design.txt
@@ -0,0 +1,816 @@
XFS Delayed Logging Design
--------------------------

Introduction to Re-logging in XFS
---------------------------------

XFS logging is a combination of logical and physical logging. Some objects,
such as inodes and dquots, are logged in logical format where the details
logged are made up of the changes to in-core structures rather than on-disk
structures. Other objects - typically buffers - have their physical changes
logged. The reason for these differences is to reduce the amount of log space
required for objects that are frequently logged. Some parts of inodes are more
frequently logged than others, and inodes are typically more frequently logged
than any other object (except maybe the superblock buffer) so keeping the
amount of metadata logged low is of prime importance.

The reason that this is such a concern is that XFS allows multiple separate
modifications to a single object to be carried in the log at any given time.
This allows the log to avoid needing to flush each change to disk before
recording a new change to the object. XFS does this via a method called
"re-logging". Conceptually, this is quite simple - all it requires is that any
new change to the object is recorded with a *new copy* of all the existing
changes in the new transaction that is written to the log.
That is, if we have a sequence of changes A through to F, and the object was
written to disk after change D, we would see in the log the following series
of transactions, their contents and the log sequence number (LSN) of the
transaction:

    Transaction     Contents        LSN
        A              A             X
        B             A+B           X+n
        C            A+B+C         X+n+m
        D           A+B+C+D       X+n+m+o
         <object written to disk>
        E              E             Y (> X+n+m+o)
        F             E+F           Y+p

In other words, each time an object is relogged, the new transaction contains
the aggregation of all the previous changes currently held only in the log.

This relogging technique also allows objects to be moved forward in the log so
that an object being relogged does not prevent the tail of the log from ever
moving forward. This can be seen in the table above by the changing
(increasing) LSN of each subsequent transaction - the LSN is effectively a
direct encoding of the location in the log of the transaction.

This relogging is also used to implement long-running, multiple-commit
transactions. These transactions are known as rolling transactions, and require
a special log reservation known as a permanent transaction reservation. A
typical example of a rolling transaction is the removal of extents from an
inode, which can only be done at a rate of two extents per transaction because
of reservation size limitations. Hence a rolling extent removal transaction
keeps relogging the inode and btree buffers as they get modified in each
removal operation. This keeps them moving forward in the log as the operation
progresses, ensuring that the current operation never gets blocked by itself
if the log wraps around.
Hence it can be seen that the relogging operation is fundamental to the correct
working of the XFS journalling subsystem. From the above description, most
people should be able to see why XFS metadata operations write so much to
the log - repeated operations on the same objects write the same changes to
the log over and over again. Worse is the fact that objects tend to get
dirtier as they get relogged, so each subsequent transaction is writing more
metadata into the log.

Another feature of the XFS transaction subsystem is that most transactions are
asynchronous. That is, they don't commit to disk until either a log buffer is
filled (a log buffer can hold multiple transactions) or a synchronous operation
forces the log buffers holding the transactions to disk. This means that XFS is
doing aggregation of transactions in memory - batching them, if you like - to
minimise the impact of the log IO on transaction throughput.

The limitation on asynchronous transaction throughput is the number and size of
log buffers made available by the log manager. By default there are 8 log
buffers available and the size of each is 32kB - the size can be increased up
to 256kB by use of a mount option.

Effectively, this gives us the maximum bound of outstanding metadata changes
that can be made to the filesystem at any point in time - if all the log
buffers are full and under IO, then no more transactions can be committed until
the current batch completes. It is now common for a single current CPU core to
be able to issue enough transactions to keep the log buffers full and under
IO permanently. Hence the XFS journalling subsystem can be considered to be IO
bound.
Delayed Logging: Concepts
-------------------------

The key thing to note about the asynchronous logging combined with the
relogging technique XFS uses is that we can be relogging changed objects
multiple times before they are committed to disk in the log buffers. If we
return to the previous relogging example, it is entirely possible that
transactions A through D are committed to disk in the same log buffer.

That is, a single log buffer may contain multiple copies of the same object,
but only one of those copies needs to be there - the last one "D", as it
contains all the previous changes. In other words, we have one
necessary copy in the log buffer, and three stale copies that are simply
wasting space. When we are doing repeated operations on the same set of
objects, these "stale objects" can be over 90% of the space used in the log
buffers. It is clear that reducing the number of stale objects written to the
log would greatly reduce the amount of metadata we write to the log, and this
is the fundamental goal of delayed logging.

From a conceptual point of view, XFS is already doing relogging in memory (where
memory == log buffer), only it is doing it extremely inefficiently. It is using
logical-to-physical formatting to do the relogging because there is no
infrastructure to keep track of logical changes in memory prior to physically
formatting the changes in a transaction to the log buffer. Hence we cannot avoid
accumulating stale objects in the log buffers.

Delayed logging is the name we've given to keeping and tracking transactional
changes to objects in memory outside the log buffer infrastructure. Because of
the relogging concept fundamental to the XFS journalling subsystem, this is
actually relatively easy to do - all the changes to logged items are already
tracked in the current infrastructure. The big problem is how to accumulate
them and get them to the log in a consistent, recoverable manner.
Describing the problems and how they have been solved is the focus of this
document.
One of the key changes that delayed logging makes to the operation of the
journalling subsystem is that it disassociates the amount of outstanding
metadata changes from the size and number of log buffers available. In other
words, instead of there only being a maximum of 2MB of transaction changes not
written to the log at any point in time, there may be a much greater amount
being accumulated in memory. Hence the potential for loss of metadata on a
crash is much greater than for the existing logging mechanism.

It should be noted that this does not change the guarantee that log recovery
will result in a consistent filesystem. What it does mean is that as far as the
recovered filesystem is concerned, there may be many thousands of transactions
that simply did not occur as a result of the crash. This makes it even more
important that applications that care about their data use fsync() where they
need to ensure application level data integrity is maintained.

It should be noted that delayed logging is not an innovative new concept that
warrants rigorous proofs to determine whether it is correct or not. The method
of accumulating changes in memory for some period before writing them to the
log is used effectively in many filesystems including ext3 and ext4. Hence
no time is spent in this document trying to convince the reader that the
concept is sound. Instead it is simply considered a "solved problem" and as
such implementing it in XFS is purely an exercise in software engineering.

The fundamental requirements for delayed logging in XFS are simple:

    1. Reduce the amount of metadata written to the log by at least
       an order of magnitude.
    2. Supply sufficient statistics to validate Requirement #1.
    3. Supply sufficient new tracing infrastructure to be able to debug
       problems with the new code.
    4. No on-disk format change (metadata or log format).
    5. Enable and disable with a mount option.
    6. No performance regressions for synchronous transaction workloads.
Delayed Logging: Design
-----------------------

Storing Changes

The problem with accumulating changes at a logical level (i.e. just using the
existing log item dirty region tracking) is that when it comes to writing the
changes to the log buffers, we need to ensure that the object we are formatting
is not changing while we do this. This requires locking the object to prevent
concurrent modification. Hence flushing the logical changes to the log would
require us to lock every object, format them, and then unlock them again.

This introduces lots of scope for deadlocks with transactions that are already
running. For example, a transaction has object A locked and modified, but needs
the delayed logging tracking lock to commit the transaction. However, the
flushing thread has the delayed logging tracking lock already held, and is
trying to get the lock on object A to flush it to the log buffer. This appears
to be an unsolvable deadlock condition, and it was solving this problem that
was the barrier to implementing delayed logging for so long.

The solution is relatively simple - it just took a long time to recognise it.
Put simply, the current logging code formats the changes to each item into a
vector array that points to the changed regions in the item. The log write code
simply copies the memory these vectors point to into the log buffer during
transaction commit while the item is locked in the transaction. Instead of
using the log buffer as the destination of the formatting code, we can use an
allocated memory buffer big enough to fit the formatted vector.

If we then copy the vector into the memory buffer and rewrite the vector to
point to the memory buffer rather than the object itself, we now have a copy of
the changes in a format that is compatible with the log buffer writing code
and that does not require us to lock the item to access it. This formatting and
rewriting can all be done while the object is locked during transaction commit,
resulting in a vector that is transactionally consistent and can be accessed
without needing to lock the owning item.

Hence we avoid the need to lock items when we need to flush outstanding
asynchronous transactions to the log. The differences between the existing
formatting method and the delayed logging formatting can be seen in the
diagram below.

Current format log vector:

    Object        +---------------------------------------------+
    Vector 1          +----+
    Vector 2                        +----+
    Vector 3                                     +----------+

After formatting:

    Log Buffer                    +-V1-+-V2-+----V3----+

Delayed logging vector:

    Object        +---------------------------------------------+
    Vector 1          +----+
    Vector 2                        +----+
    Vector 3                                     +----------+

After formatting:

    Memory Buffer                 +-V1-+-V2-+----V3----+
    Vector 1                      +----+
    Vector 2                           +----+
    Vector 3                                +----------+

The memory buffer and associated vector need to be passed as a single object,
but still need to be associated with the parent object so if the object is
relogged we can replace the current memory buffer with a new memory buffer that
contains the latest changes.
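
To make the later discussion concrete, the following is a minimal sketch of
what such a self-describing vector object and the commit-time formatting step
might look like. All structure, field and variable names here are illustrative
assumptions made for this document, not the actual kernel definitions:

    struct log_item;                            /* the owning log item */

    struct log_iovec {
            void    *i_addr;                    /* changed region in object */
            int     i_len;
    };

    struct log_vec {
            struct log_vec          *lv_next;   /* chaining for checkpoints */
            struct log_item         *lv_item;   /* owning log item */
            int                     lv_niovecs; /* regions in this vector */
            struct log_iovec        *lv_iovecp; /* the vector array */
            char                    *lv_buf;    /* formatted copy of changes */
            int                     lv_buf_len;
    };

    /*
     * Commit-time formatting, done while the item is still locked in the
     * transaction: copy each changed region into the private buffer, then
     * rewrite the vector to point at the copy rather than at the object.
     */
    char *ptr = lv->lv_buf;
    for (i = 0; i < lv->lv_niovecs; i++) {
            memcpy(ptr, lv->lv_iovecp[i].i_addr, lv->lv_iovecp[i].i_len);
            lv->lv_iovecp[i].i_addr = ptr;      /* now points at the copy */
            ptr += lv->lv_iovecp[i].i_len;
    }

After this loop runs, nothing in the vector references the object itself, which
is exactly the property the rest of the design relies on.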

The reason for keeping the vector around after we've formatted the memory
buffer is to support splitting vectors across log buffer boundaries correctly.
If we don't keep the vector around, we do not know where the region boundaries
are in the item, so we'd need a new encapsulation method for regions in the log
buffer writing (i.e. double encapsulation). This would be an on-disk format
change and as such is not desirable. It also means we'd have to write the log
region headers in the formatting stage, which is problematic as there is
per-region state that needs to be placed into the headers during the log write.

Hence we need to keep the vector, but by attaching the memory buffer to it and
rewriting the vector addresses to point at the memory buffer we end up with a
self-describing object that can be passed to the log buffer write code to be
handled in exactly the same manner as the existing log vectors are handled.
Hence we avoid needing a new on-disk format to handle items that have been
relogged in memory.


Tracking Changes

Now that we can record transactional changes in memory in a form that allows
them to be used without limitations, we need to be able to track and accumulate
them so that they can be written to the log at some later point in time. The
log item is the natural place to store this vector and buffer, and it also
makes sense for it to be the object used to track committed objects, as it will
always exist once the object has been included in a transaction.

The log item is already used to track the log items that have been written to
the log but not yet written to disk. Such log items are considered "active"
and as such are stored in the Active Item List (AIL), which is an LSN-ordered
doubly linked list. Items are inserted into this list during log buffer IO
completion, after which they are unpinned and can be written to disk. An object
that is in the AIL can be relogged, which causes the object to be pinned again
and then moved forward in the AIL when the log buffer IO completes for that
transaction.

Essentially, this shows that an item that is in the AIL can still be modified
and relogged, so any tracking must be separate to the AIL infrastructure. As
such, we cannot reuse the AIL list pointers for tracking committed items, nor
can we store state in any field that is protected by the AIL lock. Hence the
committed item tracking needs its own locks, lists and state fields in the log
item.

Similar to the AIL, tracking of committed items is done through a new list
called the Committed Item List (CIL). The list tracks log items that have been
committed and have formatted memory buffers attached to them. It tracks objects
in transaction commit order, so when an object is relogged it is removed from
its place in the list and re-inserted at the tail. This is entirely arbitrary
and done to make it easy for debugging - the last items in the list are the
ones that are most recently modified. Ordering of the CIL is not necessary for
transactional integrity (as discussed in the next section) so the ordering is
done for convenience/sanity of the developers.
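
A sketch of that insertion, assuming the CIL list is protected by its own spin
lock and that an item's list entry is initialised with INIT_LIST_HEAD()
whenever the item is not on the CIL (all names illustrative):

    spin_lock(&cil->lock);
    /*
     * list_move_tail() handles both cases: the first insertion of an
     * initialised (self-pointing) entry, and the relogging of an item
     * that is already somewhere in the list.
     */
    list_move_tail(&lip->li_cil, &cil->cil_items);
    spin_unlock(&cil->lock);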


Delayed Logging: Checkpoints

When we have a log synchronisation event, commonly known as a "log force",
all the items in the CIL must be written into the log via the log buffers.
We need to write these items in the order that they exist in the CIL, and they
need to be written as an atomic transaction. The need for all the objects to be
written as an atomic transaction comes from the requirements of relogging and
log replay - all the changes in all the objects in a given transaction must
either be completely replayed during log recovery, or not replayed at all. If
a transaction is not replayed because it is not complete in the log, then
no later transactions should be replayed, either.

To fulfill this requirement, we need to write the entire CIL in a single log
transaction. Fortunately, the XFS log code has no fixed limit on the size of a
transaction, nor does the log replay code. The only fundamental limit is that
the transaction cannot be larger than just under half the size of the log. The
reason for this limit is that to find the head and tail of the log, there must
be at least one complete transaction in the log at any given time. If a
transaction is larger than half the log, then there is the possibility that a
crash during the write of such a transaction could partially overwrite the
only complete previous transaction in the log. This will result in a recovery
failure and an inconsistent filesystem and hence we must enforce the maximum
size of a checkpoint to be slightly less than half the log.

Apart from this size requirement, a checkpoint transaction looks no different
to any other transaction - it contains a transaction header, a series of
formatted log items and a commit record at the tail. From a recovery
perspective, the checkpoint transaction is also no different - just a lot
bigger with a lot more items in it. The worst case effect of this is that we
might need to tune the recovery transaction object hash size.

Because the checkpoint is just another transaction and all the changes to log
items are stored as log vectors, we can use the existing log buffer writing
code to write the changes into the log. To do this efficiently, we need to
minimise the time we hold the CIL locked while writing the checkpoint
transaction. The current log write code enables us to do this easily with the
way it separates the writing of the transaction contents (the log vectors) from
the transaction commit record, but tracking this requires us to have a
per-checkpoint context that travels through the log write process through to
checkpoint completion.

Hence a checkpoint has a context that tracks the state of the current
checkpoint from initiation to checkpoint completion. A new context is initiated
at the same time a checkpoint transaction is started. That is, when we remove
all the current items from the CIL during a checkpoint operation, we move all
those changes into the current checkpoint context. We then initialise a new
context and attach that to the CIL for aggregation of new transactions.

This allows us to unlock the CIL immediately after transfer of all the
committed items and effectively allow new transactions to be issued while we
are formatting the checkpoint into the log. It also allows concurrent
checkpoints to be written into the log buffers in the case of log force heavy
workloads, just like the existing transaction commit code does. This, however,
requires that we strictly order the commit records in the log so that
checkpoint sequence order is maintained during log replay.

Because we can be writing an item into a checkpoint transaction at the same
time another transaction modifies the item and inserts the log item into the
new CIL, the checkpoint transaction commit code cannot use log items to store
the list of log vectors that need to be written into the transaction. Hence
log vectors need to be able to be chained together to allow them to be
detached from the log items. That is, when the CIL is flushed the memory
buffer and log vector attached to each log item need to be attached to the
checkpoint context so that the log item can be released. In diagrammatic form,
the CIL would look like this before the flush:

    CIL Head
       |
       V
    Log Item <-> log vector 1       -> memory buffer
       |                            -> vector array
       V
    Log Item <-> log vector 2       -> memory buffer
       |                            -> vector array
       V
    ......
       |
       V
    Log Item <-> log vector N-1     -> memory buffer
       |                            -> vector array
       V
    Log Item <-> log vector N       -> memory buffer
                                    -> vector array

And after the flush the CIL head is empty, and the checkpoint context log
vector list would look like:

    Checkpoint Context
       |
       V
    log vector 1        -> memory buffer
       |                -> vector array
       |                -> Log Item
       V
    log vector 2        -> memory buffer
       |                -> vector array
       |                -> Log Item
       V
    ......
       |
       V
    log vector N-1      -> memory buffer
       |                -> vector array
       |                -> Log Item
       V
    log vector N        -> memory buffer
                        -> vector array
                        -> Log Item

Once this transfer is done, the CIL can be unlocked and new transactions can
start, while the checkpoint flush code works over the log vector chain to
commit the checkpoint.
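
In sketch form, using the illustrative names from earlier, the transfer might
look like the following. The context swap and vector chaining happen while new
commits are held out; the expensive log writes happen afterwards:

    down_write(&cil->flush_lock);       /* hold out transaction commits */
    ctx = cil->ctx;                     /* the checkpoint being flushed */
    new_ctx->sequence = ctx->sequence + 1;  /* see the sequencing section */

    spin_lock(&cil->lock);
    lvp = &ctx->lv_chain;               /* tail pointer preserves CIL order */
    while (!list_empty(&cil->cil_items)) {
            lip = list_first_entry(&cil->cil_items, struct log_item, li_cil);
            list_del_init(&lip->li_cil);

            lv = lip->li_lv;            /* break the item <-> vector link */
            lip->li_lv = NULL;
            lv->lv_next = NULL;         /* chain the vector onto the context */
            *lvp = lv;
            lvp = &lv->lv_next;
    }
    cil->ctx = new_ctx;                 /* new transactions aggregate here */
    /* ctx would also be added to the "committing" list at this point */
    spin_unlock(&cil->lock);
    up_write(&cil->flush_lock);         /* new commits may proceed */

    /* the log writes work over ctx->lv_chain, with the CIL unlocked */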

Once the checkpoint is written into the log buffers, the checkpoint context is
attached to the log buffer that the commit record was written to along with a
completion callback. Log IO completion will call that callback, which can then
run transaction committed processing for the log items (i.e. insert into AIL
and unpin) in the log vector chain and then free the log vector chain and
checkpoint context.
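
A sketch of that completion callback (illustrative names; it assumes each
vector, its iovec array and its buffer were allocated as a single memory
chunk, and elides the locking around the committing list):

    static void
    checkpoint_done(struct checkpoint_ctx *ctx)
    {
            struct log_vec  *lv, *next;

            for (lv = ctx->lv_chain; lv; lv = next) {
                    next = lv->lv_next;
                    /* committed processing: insert into AIL and unpin */
                    log_item_committed(lv->lv_item, ctx->commit_lsn);
                    log_item_unpin(lv->lv_item);
                    kfree(lv);          /* frees vector array and buffer too */
            }
            list_del(&ctx->committing); /* checkpoint is fully complete */
            kfree(ctx);
    }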

Discussion Point: I am uncertain as to whether the log item is the most
efficient way to track vectors, even though it seems like the natural way to do
it. The fact that we walk the log items (in the CIL) just to chain the log
vectors and break the link between the log item and the log vector means that
we take a cache line hit for the log item list modification, then another for
the log vector chaining. If we track by the log vectors, then we only need to
break the link between the log item and the log vector, which means we should
dirty only the log item cachelines. Normally I wouldn't be concerned about one
vs two dirty cachelines except for the fact I've seen upwards of 80,000 log
vectors in one checkpoint transaction. I'd guess this is a "measure and
compare" situation that can be done after a working and reviewed implementation
is in the dev tree....

Delayed Logging: Checkpoint Sequencing

One of the key aspects of the XFS transaction subsystem is that it tags
committed transactions with the log sequence number of the transaction commit.
This allows transactions to be issued asynchronously even though there may be
future operations that cannot be completed until that transaction is fully
committed to the log. In the rare case that a dependent operation occurs (e.g.
re-using a freed metadata extent for a data extent), a special, optimised log
force can be issued to force the dependent transaction to disk immediately.

To do this, transactions need to record the LSN of the commit record of the
transaction. This LSN comes directly from the log buffer the transaction is
written into. While this works just fine for the existing transaction
mechanism, it does not work for delayed logging because transactions are not
written directly into the log buffers. Hence some other method of sequencing
transactions is required.

As discussed in the checkpoint section, delayed logging uses per-checkpoint
contexts, and as such it is simple to assign a sequence number to each
checkpoint. Because the switching of checkpoint contexts must be done
atomically, it is simple to ensure that each new context has a monotonically
increasing sequence number assigned to it without the need for an external
atomic counter - we can just take the current context sequence number and add
one to it for the new context.

Then, instead of assigning a log buffer LSN to the transaction commit LSN
during the commit, we can assign the current checkpoint sequence. This allows
operations that track transactions that have not yet completed to know what
checkpoint sequence needs to be committed before they can continue. As a
result, the code that forces the log to a specific LSN now needs to ensure that
the log forces to a specific checkpoint.

To ensure that we can do this, we need to track all the checkpoint contexts
that are currently committing to the log. When we flush a checkpoint, the
context gets added to a "committing" list which can be searched. When a
checkpoint commit completes, it is removed from the committing list. Because
the checkpoint context records the LSN of the commit record for the checkpoint,
we can also wait on the log buffer that contains the commit record, thereby
using the existing log force mechanisms to execute synchronous forces.

It should be noted that the synchronous forces may need to be extended with
mitigation algorithms similar to the current log buffer code to allow
aggregation of multiple synchronous transactions if there are already
synchronous transactions being flushed. Investigation of the performance of the
current design is needed before making any decisions here.

The main concern with log forces is to ensure that all the previous checkpoints
are also committed to disk before the one we need to wait for. Therefore we
need to check that all the prior contexts in the committing list are also
complete before waiting on the one we need to complete. We do this
synchronisation in the log force code so that we don't need to wait anywhere
else for such serialisation - it only matters when we do a log force.

The only remaining complexity is that a log force now also has to handle the
case where the forcing sequence number is the same as the current context. That
is, we need to flush the CIL and potentially wait for it to complete. This is a
simple addition to the existing log forcing code to check the sequence numbers
and push if required. Indeed, placing the current sequence checkpoint flush in
the log force code enables the current mechanism for issuing synchronous
transactions to remain untouched (i.e. commit an asynchronous transaction, then
force the log at the LSN of that transaction) and so the higher level code
behaves the same regardless of whether delayed logging is being used or not.
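
In outline, a sequence-aware log force might look like this. The helper names
are hypothetical, and the wait loop behind wait_for_sequence() is sketched in
the scalability section later in this document:

    /* force the log to a given checkpoint sequence */
    if (sequence == cil->ctx->sequence)
            cil_push(cil);              /* flush the still-open context */

    /*
     * Wait for this checkpoint and all earlier ones on the committing
     * list to write their commit records, then force the log to the
     * commit record LSN using the existing log force mechanism.
     */
    commit_lsn = wait_for_sequence(cil, sequence);
    if (commit_lsn)
            log_force_lsn(log, commit_lsn);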

Delayed Logging: Checkpoint Log Space Accounting

The big issue for a checkpoint transaction is the log space reservation for the
transaction. We don't know how big a checkpoint transaction is going to be
ahead of time, nor how many log buffers it will take to write out, nor the
number of split log vector regions that are going to be used. We can track the
amount of log space required as we add items to the commit item list, but we
still need to reserve the space in the log for the checkpoint.

A typical transaction reserves enough space in the log for the worst case space
usage of the transaction. The reservation accounts for log record headers,
transaction and region headers, headers for split regions, buffer tail padding,
etc. as well as the actual space for all the changed metadata in the
transaction. While some of this is fixed overhead, much of it is dependent on
the size of the transaction and the number of regions being logged (the number
of log vectors in the transaction).

An example of the differences would be logging directory changes versus logging
inode changes. If you modify lots of inode cores (e.g. chmod -R g+w *), then
there are lots of transactions that only contain an inode core and an inode log
format structure. That is, two vectors totaling roughly 150 bytes. If we modify
10,000 inodes, we have about 1.5MB of metadata to write in 20,000 vectors. Each
vector is 12 bytes, so the vector overhead is about 240KB and the total to be
logged is approximately 1.75MB. In comparison, if we are logging full directory
buffers, they are typically 4KB each, so in 1.5MB of directory buffers we'd
have roughly 400 buffers and a buffer format structure for each buffer -
roughly 800 vectors or 1.51MB total space. From this, it should be obvious that
a static log space reservation is not particularly flexible and makes it
difficult to select the "optimal value" for all workloads.

Further, if we are going to use a static reservation, which bit of the entire
reservation does it cover? We account for space used by the transaction
reservation by tracking the space currently used by the object in the CIL and
then calculating the increase or decrease in space used as the object is
relogged. This allows for a checkpoint reservation to only have to account for
log buffer metadata used such as log header records.

However, even using a static reservation for just the log metadata is
problematic. Typically log record headers use at least 16KB of log space per
1MB of log space consumed (512 bytes per 32k) and the reservation needs to be
large enough to handle arbitrary sized checkpoint transactions. This
reservation needs to be made before the checkpoint is started, and we need to
be able to reserve the space without sleeping. For an 8MB checkpoint, we need a
reservation of around 150KB, which is a non-trivial amount of space.

A static reservation needs to manipulate the log grant counters - we can take a
permanent reservation on the space, but we still need to make sure we refresh
the write reservation (the actual space available to the transaction) after
every checkpoint transaction completion. Unfortunately, if this space is not
available when required, then the regrant code will sleep waiting for it.

The problem with this is that it can lead to deadlocks as we may need to commit
checkpoints to be able to free up log space (refer back to the description of
rolling transactions for an example of this). Hence we *must* always have
space available in the log if we are to use static reservations, and that is
very difficult and complex to arrange. It is possible to do, but there is a
simpler way.

The simpler way of doing this is tracking the entire log space used by the
items in the CIL and using this to dynamically calculate the amount of log
space required by the log metadata. If this log metadata space changes as a
result of a transaction commit inserting a new memory buffer into the CIL, then
the difference in space required is removed from the transaction that causes
the change. Transactions at this level will *always* have enough space
available in their reservation for this as they have already reserved the
maximal amount of log metadata space they require, and such a delta reservation
will always be less than or equal to the maximal amount in the reservation.

Hence we can grow the checkpoint transaction reservation dynamically as items
are added to the CIL and avoid the need for reserving and regranting log space
up front. This avoids deadlocks and removes a blocking point from the
checkpoint flush code.

As mentioned earlier, transactions can't grow to more than half the size of the
log. Hence as part of the reservation growing, we need to also check the size
of the reservation against the maximum allowed transaction size. If we reach
the maximum threshold, we need to push the CIL to the log. This is effectively
a "background flush" and is done on demand. This is identical to
a CIL push triggered by a log force, only that there is no waiting for the
checkpoint commit to complete. This background push is checked and executed by
transaction commit code.
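
A sketch of the accounting done at insertion time, under the CIL lock. Names
are illustrative, and the push threshold is assumed to be some margin safely
below half the log size:

    /*
     * 'new_lv' is the vector just formatted for the item and 'old_len'
     * is the size the item last accounted for in the CIL (zero on first
     * insertion), so 'diff' can be negative if the item shrank.
     */
    spin_lock(&cil->lock);
    diff = new_lv->lv_buf_len - old_len;
    cil->space_used += diff;            /* running total for the checkpoint */
    tp->t_ticket->t_curr_res -= diff;   /* steal the delta from this commit */
    push = cil->space_used > CIL_SPACE_LIMIT(log);
    spin_unlock(&cil->lock);

    if (push)
            cil_push_background(cil);   /* flush now, but don't wait for it */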

If the transaction subsystem goes idle while we still have items in the CIL,
they will be flushed by the periodic log force issued by the xfssyncd. This log
force will push the CIL to disk, and if the transaction subsystem stays idle,
allow the idle log to be covered (effectively marked clean) in exactly the same
manner that is done for the existing logging method. A discussion point is
whether this log force needs to be done more frequently than the current rate,
which is once every 30s.


Delayed Logging: Log Item Pinning

Currently log items are pinned during transaction commit while the items are
still locked. This happens just after the items are formatted, though it could
be done any time before the items are unlocked. The result of this mechanism is
that items get pinned once for every transaction that is committed to the log
buffers. Hence items that are relogged in the log buffers will have a pin count
for every outstanding transaction they were dirtied in. When each of these
transactions is completed, they will unpin the item once. As a result, the item
only becomes unpinned when all the transactions complete and there are no
pending transactions. Thus the pinning and unpinning of a log item is symmetric
as there is a 1:1 relationship with transaction commit and log item completion.

For delayed logging, however, we have an asymmetric transaction commit to
completion relationship. Every time an object is relogged in the CIL it goes
through the commit process without a corresponding completion being registered.
That is, we now have a many-to-one relationship between transaction commit and
log item completion. The result of this is that pinning and unpinning of the
log items becomes unbalanced if we retain the "pin on transaction commit, unpin
on transaction completion" model.

To keep pin/unpin symmetry, the algorithm needs to change to a "pin on
insertion into the CIL, unpin on checkpoint completion" model. In other words,
the pinning and unpinning becomes symmetric around a checkpoint context. We
have to pin the object the first time it is inserted into the CIL - if it is
already in the CIL during a transaction commit, then we do not pin it again.
Because there can be multiple outstanding checkpoint contexts, we can still see
elevated pin counts, but as each checkpoint completes the pin count will retain
the correct value according to its context.

Just to make matters slightly more complex, this checkpoint level context
for the pin count means that the pinning of an item must take place under the
CIL commit/flush lock. If we pin the object outside this lock, we cannot
guarantee which context the pin count is associated with. This is because
pinning the item is dependent on whether the item is present in the
current CIL or not. If we don't lock the CIL before we check and pin the
object, we have a race with the CIL being flushed between the check and the pin
(or not pinning, as the case may be). Hence we must hold the CIL flush/commit
lock to guarantee that we pin the items correctly.
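
In sketch form, the commit side does something like this while holding the CIL
commit/flush lock shared (illustrative names; freeing the old vector and
crediting its space back are elided):

    /* called with cil->flush_lock held shared by transaction commit */
    spin_lock(&cil->lock);
    if (!lip->li_lv)                    /* no vector: not in the current CIL */
            log_item_pin(lip);          /* one pin per checkpoint context */
    lip->li_lv = new_lv;                /* attach the freshly formatted copy */
    list_move_tail(&lip->li_cil, &cil->cil_items);
    spin_unlock(&cil->lock);

Because a CIL flush clears li_lv for every item it removes, "no vector
attached" is a reliable test for "not in the current checkpoint context".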

Delayed Logging: Concurrent Scalability

A fundamental requirement for the CIL is that accesses through transaction
commits must scale to many concurrent commits. The current transaction commit
code does not break down even when there are transactions coming from 2048
processors at once. The current transaction code does not go any faster than if
there were only one CPU using it, but it does not slow down either.

As a result, the delayed logging transaction commit code needs to be designed
for concurrency from the ground up. It is obvious that there are serialisation
points in the design - the three important ones are:

    1. Locking out new transaction commits while flushing the CIL
    2. Adding items to the CIL and updating item space accounting
    3. Checkpoint commit ordering

Looking at the transaction commit and CIL flushing interactions, it is clear
that we have a many-to-one interaction here. That is, the only restriction on
the number of concurrent transactions that can be trying to commit at once is
the amount of space available in the log for their reservations. The practical
limit here is in the order of several hundred concurrent transactions for a
128MB log, which means that it is generally one per CPU in a machine.

The amount of time a transaction commit needs to hold out a flush is a
relatively long period of time - the pinning of log items needs to be done
while we are holding out a CIL flush, so at the moment that means it is held
across the formatting of the objects into memory buffers (i.e. while memcpy()s
are in progress). Ultimately a two pass algorithm where the formatting is done
separately to the pinning of objects could be used to reduce the hold time of
the transaction commit side.

Because of the number of potential transaction commit side holders, the lock
really needs to be a sleeping lock - if the CIL flush takes the lock, we do not
want every other CPU in the machine spinning on the CIL lock. Given that
flushing the CIL could involve walking a list of tens of thousands of log
items, it will get held for a significant time and so spin contention is a
significant concern. Preventing lots of CPUs spinning doing nothing is the
main reason for choosing a sleeping lock even though nothing in either the
transaction commit or CIL flush side sleeps with the lock held.

It should also be noted that CIL flushing is also a relatively rare operation
compared to transaction commit for asynchronous transaction workloads - only
time will tell if using a read-write semaphore for exclusion will limit
transaction commit concurrency due to cache line bouncing of the lock on the
read side.
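
The resulting locking pattern is the classic many-reader, one-writer exclusion
(sketch only; names illustrative):

    /* transaction commit side: many concurrent holders */
    down_read(&cil->flush_lock);
    /* ... format items, pin on first insertion, add to the CIL ... */
    up_read(&cil->flush_lock);

    /* CIL flush side: excludes all committers while contexts swap */
    down_write(&cil->flush_lock);
    /* ... swap checkpoint contexts, detach the log vector chain ... */
    up_write(&cil->flush_lock);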

The second serialisation point is on the transaction commit side where items
are inserted into the CIL. Because transactions can enter this code
concurrently, the CIL needs to be protected separately from the above
commit/flush exclusion. It also needs to be an exclusive lock but it is only
held for a very short time and so a spin lock is appropriate here. It is
possible that this lock will become a contention point, but given the short
hold time once per transaction I think that contention is unlikely.

The final serialisation point is the checkpoint commit record ordering code
that is run as part of the checkpoint commit and log force sequencing. The code
path that triggers a CIL flush (i.e. whatever triggers the log force) will enter
an ordering loop after writing all the log vectors into the log buffers but
before writing the commit record. This loop walks the list of committing
checkpoints and needs to block waiting for checkpoints to complete their commit
record write. As a result it needs a lock and a wait variable. Log force
sequencing also requires the same lock, list walk, and blocking mechanism to
ensure completion of checkpoints.

These two sequencing operations can use the same mechanism even though the
events they are waiting for are different. The checkpoint commit record
sequencing needs to wait until checkpoint contexts contain a commit LSN
(obtained through completion of a commit record write) while log force
sequencing needs to wait until previous checkpoint contexts are removed from
the committing list (i.e. they've completed). A simple wait variable and
broadcast wakeups (thundering herds) have been used to implement these two
serialisation queues. They use the same lock as the CIL, too. If we see too
much contention on the CIL lock, or too many context switches as a result of
the broadcast wakeups, these operations can be put under a new spinlock and
given separate wait lists to reduce lock contention and the number of processes
woken by the wrong event.
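
A sketch of the commit record ordering loop follows. It is illustrative only:
a real implementation must also guarantee that contexts on the committing list
cannot be freed while another thread is looking at them, which is elided here:

restart:
    spin_lock(&cil->lock);
    list_for_each_entry(other, &cil->committing, committing) {
            if (other->sequence >= ctx->sequence)
                    continue;           /* only order against earlier ones */
            if (!other->commit_lsn) {
                    /* an earlier commit record is not yet written */
                    spin_unlock(&cil->lock);
                    wait_event(cil->commit_wait, other->commit_lsn != 0);
                    goto restart;
            }
    }
    spin_unlock(&cil->lock);

    ctx->commit_lsn = write_commit_record(ctx);
    wake_up_all(&cil->commit_wait);     /* the broadcast (herd) wakeup */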


Lifecycle Changes

The existing log item life cycle is as follows:

    1. Transaction allocate
    2. Transaction reserve
    3. Lock item
    4. Join item to transaction
        If not already attached,
            Allocate log item
            Attach log item to owner item
        Attach log item to transaction
    5. Modify item
        Record modifications in log item
    6. Transaction commit
        Pin item in memory
        Format item into log buffer
        Write commit LSN into transaction
        Unlock item
        Attach transaction to log buffer

    <log buffer IO dispatched>
    <log buffer IO completes>

    7. Transaction completion
        Mark log item committed
        Insert log item into AIL
            Write commit LSN into log item
        Unpin log item
    8. AIL traversal
        Lock item
        Mark log item clean
        Flush item to disk

    <item IO completion>

    9. Log item removed from AIL
        Moves log tail
        Item unlocked

Essentially, steps 1-6 operate independently from step 7, which is also
independent of steps 8-9. An item can be locked in steps 1-6 or steps 8-9
at the same time step 7 is occurring, but only steps 1-6 or 8-9 can occur
at the same time. If the log item is in the AIL or between steps 6 and 7
and steps 1-6 are re-entered, then the item is relogged. Only when steps 8-9
are entered and completed is the object considered clean.

With delayed logging, there are new steps inserted into the life cycle:

    1. Transaction allocate
    2. Transaction reserve
    3. Lock item
    4. Join item to transaction
        If not already attached,
            Allocate log item
            Attach log item to owner item
        Attach log item to transaction
    5. Modify item
        Record modifications in log item
    6. Transaction commit
        Pin item in memory if not pinned in CIL
        Format item into log vector + buffer
        Attach log vector and buffer to log item
        Insert log item into CIL
        Write CIL context sequence into transaction
        Unlock item

    <next log force>

    7. CIL push
        lock CIL flush
        Chain log vectors and buffers together
        Remove items from CIL
        unlock CIL flush
        write log vectors into log
        sequence commit records
        attach checkpoint context to log buffer

    <log buffer IO dispatched>
    <log buffer IO completes>

    8. Checkpoint completion
        Mark log item committed
        Insert item into AIL
            Write commit LSN into log item
        Unpin log item
    9. AIL traversal
        Lock item
        Mark log item clean
        Flush item to disk

    <item IO completion>

    10. Log item removed from AIL
        Moves log tail
        Item unlocked

From this, it can be seen that the only life cycle differences between the two
logging methods are in the middle of the life cycle - they still have the same
beginning and end and execution constraints. The only differences are in the
committing of the log items to the log itself and the completion processing.
Hence delayed logging should not introduce any constraints on log item
behaviour, allocation or freeing that don't already exist.

As a result of this zero-impact "insertion" of delayed logging infrastructure
and the design of the internal structures to avoid on-disk format changes, we
can basically switch between delayed logging and the existing mechanism with a
mount option. Fundamentally, there is no reason why the log manager would not
be able to swap methods automatically and transparently depending on load
characteristics, but this should not be necessary if delayed logging works as
designed.

Roadmap:

2.6.35 Inclusion in mainline as an experimental mount option
    => approximately 2-3 months to merge window
    => needs to be in xfs-dev tree in 4-6 weeks
    => code is nearing readiness for review

2.6.37 Remove experimental tag from mount option
    => should be roughly 6 months after initial merge
    => enough time to:
        => gain confidence and fix problems reported by early
           adopters (a.k.a. guinea pigs)
        => address worst performance regressions and undesired
           behaviours
        => start tuning/optimising code for parallelism
        => start tuning/optimising algorithms consuming
           excessive CPU time

2.6.39 Switch default mount option to use delayed logging
    => should be roughly 12 months after initial merge
    => enough time to shake out remaining problems before next round of
       enterprise distro kernel rebases
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
 				   xfs_itable.o \
 				   xfs_dfrag.o \
 				   xfs_log.o \
+				   xfs_log_cil.o \
 				   xfs_log_recover.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c43..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
+#include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
  * Note that this in no way locks the underlying pages, so it is only
  * useful for synchronizing concurrent use of buffer objects, not for
  * synchronizing independent access to the underlying pages.
+ *
+ * If we come across a stale, pinned, locked buffer, we know that we
+ * are being asked to lock a buffer that has been reallocated. Because
+ * it is pinned, we know that the log has not been pushed to disk and
+ * hence it will still be locked. Rather than sleeping until someone
+ * else pushes the log, push it ourselves before trying to get the lock.
  */
 void
 xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
 {
 	trace_xfs_buf_lock(bp, _RET_IP_);
 
+	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+		xfs_log_force(bp->b_mount, 0);
 	if (atomic_read(&bp->b_io_remaining))
 		blk_run_address_space(bp->b_target->bt_mapping);
 	down(&bp->b_sema);
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index e31bf21fe5d3..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
 #include "xfs_dmapi.h"
 #include "xfs_sb.h"
 #include "xfs_inum.h"
+#include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f24dbe5efde3..f2d1718c9165 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool; | |||
119 | #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ | 119 | #define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ |
120 | #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ | 120 | #define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ |
121 | #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ | 121 | #define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ |
122 | #define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ | ||
123 | #define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ | ||
122 | 124 | ||
123 | /* | 125 | /* |
124 | * Table driven mount option parser. | 126 | * Table driven mount option parser. |
@@ -374,6 +376,13 @@ xfs_parseargs( | |||
374 | mp->m_flags |= XFS_MOUNT_DMAPI; | 376 | mp->m_flags |= XFS_MOUNT_DMAPI; |
375 | } else if (!strcmp(this_char, MNTOPT_DMI)) { | 377 | } else if (!strcmp(this_char, MNTOPT_DMI)) { |
376 | mp->m_flags |= XFS_MOUNT_DMAPI; | 378 | mp->m_flags |= XFS_MOUNT_DMAPI; |
379 | } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { | ||
380 | mp->m_flags |= XFS_MOUNT_DELAYLOG; | ||
381 | cmn_err(CE_WARN, | ||
382 | "Enabling EXPERIMENTAL delayed logging feature " | ||
383 | "- use at your own risk.\n"); | ||
384 | } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { | ||
385 | mp->m_flags &= ~XFS_MOUNT_DELAYLOG; | ||
377 | } else if (!strcmp(this_char, "ihashsize")) { | 386 | } else if (!strcmp(this_char, "ihashsize")) { |
378 | cmn_err(CE_WARN, | 387 | cmn_err(CE_WARN, |
379 | "XFS: ihashsize no longer used, option is deprecated."); | 388 | "XFS: ihashsize no longer used, option is deprecated."); |
@@ -535,6 +544,7 @@ xfs_showargs( | |||
535 | { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, | 544 | { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, |
536 | { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, | 545 | { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, |
537 | { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, | 546 | { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, |
547 | { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, | ||
538 | { 0, NULL } | 548 | { 0, NULL } |
539 | }; | 549 | }; |
540 | static struct proc_xfs_info xfs_info_unset[] = { | 550 | static struct proc_xfs_info xfs_info_unset[] = { |
@@ -1755,7 +1765,7 @@ xfs_init_zones(void) | |||
1755 | * but it is much faster. | 1765 | * but it is much faster. |
1756 | */ | 1766 | */ |
1757 | xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + | 1767 | xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + |
1758 | (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / | 1768 | (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / |
1759 | NBWORD) * sizeof(int))), "xfs_buf_item"); | 1769 | NBWORD) * sizeof(int))), "xfs_buf_item"); |
1760 | if (!xfs_buf_item_zone) | 1770 | if (!xfs_buf_item_zone) |
1761 | goto out_destroy_trans_zone; | 1771 | goto out_destroy_trans_zone; |
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 8a319cfd2901..ff6bc797baf2 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h | |||
@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap, | |||
1059 | 1059 | ||
1060 | ); | 1060 | ); |
1061 | 1061 | ||
1062 | #define XFS_BUSY_SYNC \ | ||
1063 | { 0, "async" }, \ | ||
1064 | { 1, "sync" } | ||
1065 | |||
1062 | TRACE_EVENT(xfs_alloc_busy, | 1066 | TRACE_EVENT(xfs_alloc_busy, |
1063 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, | 1067 | TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, |
1064 | xfs_extlen_t len, int slot), | 1068 | xfs_agblock_t agbno, xfs_extlen_t len, int sync), |
1065 | TP_ARGS(mp, agno, agbno, len, slot), | 1069 | TP_ARGS(trans, agno, agbno, len, sync), |
1066 | TP_STRUCT__entry( | 1070 | TP_STRUCT__entry( |
1067 | __field(dev_t, dev) | 1071 | __field(dev_t, dev) |
1072 | __field(struct xfs_trans *, tp) | ||
1073 | __field(int, tid) | ||
1068 | __field(xfs_agnumber_t, agno) | 1074 | __field(xfs_agnumber_t, agno) |
1069 | __field(xfs_agblock_t, agbno) | 1075 | __field(xfs_agblock_t, agbno) |
1070 | __field(xfs_extlen_t, len) | 1076 | __field(xfs_extlen_t, len) |
1071 | __field(int, slot) | 1077 | __field(int, sync) |
1072 | ), | 1078 | ), |
1073 | TP_fast_assign( | 1079 | TP_fast_assign( |
1074 | __entry->dev = mp->m_super->s_dev; | 1080 | __entry->dev = trans->t_mountp->m_super->s_dev; |
1081 | __entry->tp = trans; | ||
1082 | __entry->tid = trans->t_ticket->t_tid; | ||
1075 | __entry->agno = agno; | 1083 | __entry->agno = agno; |
1076 | __entry->agbno = agbno; | 1084 | __entry->agbno = agbno; |
1077 | __entry->len = len; | 1085 | __entry->len = len; |
1078 | __entry->slot = slot; | 1086 | __entry->sync = sync; |
1079 | ), | 1087 | ), |
1080 | TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", | 1088 | TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", |
1081 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1089 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1090 | __entry->tp, | ||
1091 | __entry->tid, | ||
1082 | __entry->agno, | 1092 | __entry->agno, |
1083 | __entry->agbno, | 1093 | __entry->agbno, |
1084 | __entry->len, | 1094 | __entry->len, |
1085 | __entry->slot) | 1095 | __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) |
1086 | 1096 | ||
1087 | ); | 1097 | ); |
1088 | 1098 | ||
1089 | #define XFS_BUSY_STATES \ | ||
1090 | { 0, "found" }, \ | ||
1091 | { 1, "missing" } | ||
1092 | |||
1093 | TRACE_EVENT(xfs_alloc_unbusy, | 1099 | TRACE_EVENT(xfs_alloc_unbusy, |
1094 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, | 1100 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, |
1095 | int slot, int found), | 1101 | xfs_agblock_t agbno, xfs_extlen_t len), |
1096 | TP_ARGS(mp, agno, slot, found), | 1102 | TP_ARGS(mp, agno, agbno, len), |
1097 | TP_STRUCT__entry( | 1103 | TP_STRUCT__entry( |
1098 | __field(dev_t, dev) | 1104 | __field(dev_t, dev) |
1099 | __field(xfs_agnumber_t, agno) | 1105 | __field(xfs_agnumber_t, agno) |
1100 | __field(int, slot) | 1106 | __field(xfs_agblock_t, agbno) |
1101 | __field(int, found) | 1107 | __field(xfs_extlen_t, len) |
1102 | ), | 1108 | ), |
1103 | TP_fast_assign( | 1109 | TP_fast_assign( |
1104 | __entry->dev = mp->m_super->s_dev; | 1110 | __entry->dev = mp->m_super->s_dev; |
1105 | __entry->agno = agno; | 1111 | __entry->agno = agno; |
1106 | __entry->slot = slot; | 1112 | __entry->agbno = agbno; |
1107 | __entry->found = found; | 1113 | __entry->len = len; |
1108 | ), | 1114 | ), |
1109 | TP_printk("dev %d:%d agno %u slot %d %s", | 1115 | TP_printk("dev %d:%d agno %u agbno %u len %u", |
1110 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1116 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1111 | __entry->agno, | 1117 | __entry->agno, |
1112 | __entry->slot, | 1118 | __entry->agbno, |
1113 | __print_symbolic(__entry->found, XFS_BUSY_STATES)) | 1119 | __entry->len) |
1114 | ); | 1120 | ); |
1115 | 1121 | ||
1122 | #define XFS_BUSY_STATES \ | ||
1123 | { 0, "missing" }, \ | ||
1124 | { 1, "found" } | ||
1125 | |||
1116 | TRACE_EVENT(xfs_alloc_busysearch, | 1126 | TRACE_EVENT(xfs_alloc_busysearch, |
1117 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, | 1127 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, |
1118 | xfs_extlen_t len, xfs_lsn_t lsn), | 1128 | xfs_agblock_t agbno, xfs_extlen_t len, int found), |
1119 | TP_ARGS(mp, agno, agbno, len, lsn), | 1129 | TP_ARGS(mp, agno, agbno, len, found), |
1120 | TP_STRUCT__entry( | 1130 | TP_STRUCT__entry( |
1121 | __field(dev_t, dev) | 1131 | __field(dev_t, dev) |
1122 | __field(xfs_agnumber_t, agno) | 1132 | __field(xfs_agnumber_t, agno) |
1123 | __field(xfs_agblock_t, agbno) | 1133 | __field(xfs_agblock_t, agbno) |
1124 | __field(xfs_extlen_t, len) | 1134 | __field(xfs_extlen_t, len) |
1125 | __field(xfs_lsn_t, lsn) | 1135 | __field(int, found) |
1126 | ), | 1136 | ), |
1127 | TP_fast_assign( | 1137 | TP_fast_assign( |
1128 | __entry->dev = mp->m_super->s_dev; | 1138 | __entry->dev = mp->m_super->s_dev; |
1129 | __entry->agno = agno; | 1139 | __entry->agno = agno; |
1130 | __entry->agbno = agbno; | 1140 | __entry->agbno = agbno; |
1131 | __entry->len = len; | 1141 | __entry->len = len; |
1132 | __entry->lsn = lsn; | 1142 | __entry->found = found; |
1133 | ), | 1143 | ), |
1134 | TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", | 1144 | TP_printk("dev %d:%d agno %u agbno %u len %u %s", |
1135 | MAJOR(__entry->dev), MINOR(__entry->dev), | 1145 | MAJOR(__entry->dev), MINOR(__entry->dev), |
1136 | __entry->agno, | 1146 | __entry->agno, |
1137 | __entry->agbno, | 1147 | __entry->agbno, |
1138 | __entry->len, | 1148 | __entry->len, |
1149 | __print_symbolic(__entry->found, XFS_BUSY_STATES)) | ||
1150 | ); | ||
1151 | |||
1152 | TRACE_EVENT(xfs_trans_commit_lsn, | ||
1153 | TP_PROTO(struct xfs_trans *trans), | ||
1154 | TP_ARGS(trans), | ||
1155 | TP_STRUCT__entry( | ||
1156 | __field(dev_t, dev) | ||
1157 | __field(struct xfs_trans *, tp) | ||
1158 | __field(xfs_lsn_t, lsn) | ||
1159 | ), | ||
1160 | TP_fast_assign( | ||
1161 | __entry->dev = trans->t_mountp->m_super->s_dev; | ||
1162 | __entry->tp = trans; | ||
1163 | __entry->lsn = trans->t_commit_lsn; | ||
1164 | ), | ||
1165 | TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", | ||
1166 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
1167 | __entry->tp, | ||
1139 | __entry->lsn) | 1168 | __entry->lsn) |
1140 | ); | 1169 | ); |
1141 | 1170 | ||
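Given the TP_printk formats above, the reworked events render roughly as follows in the trace buffer (all field values below are invented for illustration):

	xfs_alloc_busy: dev 253:0 trans 0xffff88003a5e2000 tid 0x4c agno 2 agbno 1034 len 8 async
	xfs_alloc_unbusy: dev 253:0 agno 2 agbno 1034 len 8
	xfs_alloc_busysearch: dev 253:0 agno 2 agbno 1034 len 8 found

Note that the search event now reports found/missing directly instead of the LSN that the old slot-based code had to force the log to.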
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index b89ec5df0129..585e7633dfc7 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c | |||
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk( | |||
344 | for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) | 344 | for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) |
345 | xfs_qm_dqinit_core(curid, type, d); | 345 | xfs_qm_dqinit_core(curid, type, d); |
346 | xfs_trans_dquot_buf(tp, bp, | 346 | xfs_trans_dquot_buf(tp, bp, |
347 | (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : | 347 | (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : |
348 | ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : | 348 | ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : |
349 | XFS_BLI_GDQUOT_BUF))); | 349 | XFS_BLF_GDQUOT_BUF))); |
350 | xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); | 350 | xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); |
351 | } | 351 | } |
352 | 352 | ||
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index abb8222b88c9..401f364ad36c 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h | |||
@@ -175,14 +175,20 @@ typedef struct xfs_agfl { | |||
175 | } xfs_agfl_t; | 175 | } xfs_agfl_t; |
176 | 176 | ||
177 | /* | 177 | /* |
178 | * Busy block/extent entry. Used in perag to mark blocks that have been freed | 178 | * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that |
179 | * but whose transactions aren't committed to disk yet. | 179 | * have been freed but whose transactions aren't committed to disk yet. |
180 | * | ||
181 | * Note that we use the transaction ID to record the transaction, not the | ||
182 | * transaction structure itself. See xfs_alloc_busy_insert() for details. | ||
180 | */ | 183 | */ |
181 | typedef struct xfs_perag_busy { | 184 | struct xfs_busy_extent { |
182 | xfs_agblock_t busy_start; | 185 | struct rb_node rb_node; /* ag by-bno indexed search tree */ |
183 | xfs_extlen_t busy_length; | 186 | struct list_head list; /* transaction busy extent list */ |
184 | struct xfs_trans *busy_tp; /* transaction that did the free */ | 187 | xfs_agnumber_t agno; |
185 | } xfs_perag_busy_t; | 188 | xfs_agblock_t bno; |
189 | xfs_extlen_t length; | ||
190 | xlog_tid_t tid; /* transaction that created this */ | ||
191 | }; | ||
186 | 192 | ||
187 | /* | 193 | /* |
188 | * Per-ag incore structure, copies of information in agf and agi, | 194 | * Per-ag incore structure, copies of information in agf and agi, |
@@ -216,7 +222,8 @@ typedef struct xfs_perag { | |||
216 | xfs_agino_t pagl_leftrec; | 222 | xfs_agino_t pagl_leftrec; |
217 | xfs_agino_t pagl_rightrec; | 223 | xfs_agino_t pagl_rightrec; |
218 | #ifdef __KERNEL__ | 224 | #ifdef __KERNEL__ |
219 | spinlock_t pagb_lock; /* lock for pagb_list */ | 225 | spinlock_t pagb_lock; /* lock for pagb_tree */ |
226 | struct rb_root pagb_tree; /* ordered tree of busy extents */ | ||
220 | 227 | ||
221 | atomic_t pagf_fstrms; /* # of filestreams active in this AG */ | 228 | atomic_t pagf_fstrms; /* # of filestreams active in this AG */ |
222 | 229 | ||
@@ -226,7 +233,6 @@ typedef struct xfs_perag { | |||
226 | int pag_ici_reclaimable; /* reclaimable inodes */ | 233 | int pag_ici_reclaimable; /* reclaimable inodes */ |
227 | #endif | 234 | #endif |
228 | int pagb_count; /* pagb slots in use */ | 235 | int pagb_count; /* pagb slots in use */ |
229 | xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ | ||
230 | } xfs_perag_t; | 236 | } xfs_perag_t; |
231 | 237 | ||
232 | /* | 238 | /* |
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 94cddbfb2560..a7fbe8a99b12 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c | |||
@@ -46,11 +46,9 @@ | |||
46 | #define XFSA_FIXUP_BNO_OK 1 | 46 | #define XFSA_FIXUP_BNO_OK 1 |
47 | #define XFSA_FIXUP_CNT_OK 2 | 47 | #define XFSA_FIXUP_CNT_OK 2 |
48 | 48 | ||
49 | STATIC void | 49 | static int |
50 | xfs_alloc_search_busy(xfs_trans_t *tp, | 50 | xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, |
51 | xfs_agnumber_t agno, | 51 | xfs_agblock_t bno, xfs_extlen_t len); |
52 | xfs_agblock_t bno, | ||
53 | xfs_extlen_t len); | ||
54 | 52 | ||
55 | /* | 53 | /* |
56 | * Prototypes for per-ag allocation routines | 54 | * Prototypes for per-ag allocation routines |
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent( | |||
540 | be32_to_cpu(agf->agf_length)); | 538 | be32_to_cpu(agf->agf_length)); |
541 | xfs_alloc_log_agf(args->tp, args->agbp, | 539 | xfs_alloc_log_agf(args->tp, args->agbp, |
542 | XFS_AGF_FREEBLKS); | 540 | XFS_AGF_FREEBLKS); |
543 | /* search the busylist for these blocks */ | 541 | /* |
544 | xfs_alloc_search_busy(args->tp, args->agno, | 542 | * Search the busylist for these blocks and mark the |
545 | args->agbno, args->len); | 543 | * transaction as synchronous if blocks are found. This |
544 | * avoids the need to block due to a synchronous log | ||
545 | * force to ensure correct ordering as the synchronous | ||
546 | * transaction will guarantee that for us. | ||
547 | */ | ||
548 | if (xfs_alloc_busy_search(args->mp, args->agno, | ||
549 | args->agbno, args->len)) | ||
550 | xfs_trans_set_sync(args->tp); | ||
546 | } | 551 | } |
547 | if (!args->isfl) | 552 | if (!args->isfl) |
548 | xfs_trans_mod_sb(args->tp, | 553 | xfs_trans_mod_sb(args->tp, |
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent( | |||
1693 | * when the iclog commits to disk. If a busy block is allocated, | 1698 | * when the iclog commits to disk. If a busy block is allocated, |
1694 | * the iclog is pushed up to the LSN that freed the block. | 1699 | * the iclog is pushed up to the LSN that freed the block. |
1695 | */ | 1700 | */ |
1696 | xfs_alloc_mark_busy(tp, agno, bno, len); | 1701 | xfs_alloc_busy_insert(tp, agno, bno, len); |
1697 | return 0; | 1702 | return 0; |
1698 | 1703 | ||
1699 | error0: | 1704 | error0: |
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist( | |||
1989 | *bnop = bno; | 1994 | *bnop = bno; |
1990 | 1995 | ||
1991 | /* | 1996 | /* |
1992 | * As blocks are freed, they are added to the per-ag busy list | 1997 | * As blocks are freed, they are added to the per-ag busy list and |
1993 | * and remain there until the freeing transaction is committed to | 1998 | * remain there until the freeing transaction is committed to disk. |
1994 | * disk. Now that we have allocated blocks, this list must be | 1999 | * Now that we have allocated blocks, this list must be searched to see |
1995 | * searched to see if a block is being reused. If one is, then | 2000 | * if a block is being reused. If one is, then the freeing transaction |
1996 | * the freeing transaction must be pushed to disk NOW by forcing | 2001 | * must be pushed to disk before this transaction. |
1997 | * to disk all iclogs up that transaction's LSN. | 2002 | * |
2003 | * We do this by setting the current transaction to a sync transaction | ||
2004 | * which guarantees that the freeing transaction is on disk before this | ||
2005 | * transaction. This is done instead of a synchronous log force here so | ||
2006 | * that we don't sit and wait with the AGF locked in the transaction | ||
2007 | * during the log force. | ||
1998 | */ | 2008 | */ |
1999 | xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); | 2009 | if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) |
2010 | xfs_trans_set_sync(tp); | ||
2000 | return 0; | 2011 | return 0; |
2001 | } | 2012 | } |
2002 | 2013 | ||
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf( | |||
2201 | be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); | 2212 | be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); |
2202 | spin_lock_init(&pag->pagb_lock); | 2213 | spin_lock_init(&pag->pagb_lock); |
2203 | pag->pagb_count = 0; | 2214 | pag->pagb_count = 0; |
2204 | memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); | 2215 | pag->pagb_tree = RB_ROOT; |
2205 | pag->pagf_init = 1; | 2216 | pag->pagf_init = 1; |
2206 | } | 2217 | } |
2207 | #ifdef DEBUG | 2218 | #ifdef DEBUG |
@@ -2479,127 +2490,263 @@ error0: | |||
2479 | * list is reused, the transaction that freed it must be forced to disk | 2490 | * list is reused, the transaction that freed it must be forced to disk |
2480 | * before continuing to use the block. | 2491 | * before continuing to use the block. |
2481 | * | 2492 | * |
2482 | * xfs_alloc_mark_busy - add to the per-ag busy list | 2493 | * xfs_alloc_busy_insert - add to the per-ag busy list |
2483 | * xfs_alloc_clear_busy - remove an item from the per-ag busy list | 2494 | * xfs_alloc_busy_clear - remove an item from the per-ag busy list |
2495 | * xfs_alloc_busy_search - search for a busy extent | ||
2496 | */ | ||
2497 | |||
2498 | /* | ||
2499 | * Insert a new extent into the busy tree. | ||
2500 | * | ||
2501 | * The busy extent tree is indexed by the start block of the busy extent. | ||
2502 | * There can be multiple overlapping ranges in the busy extent tree but only | ||
2503 | * ever one entry at a given start block. The reason for this is that | ||
2504 | * multi-block extents can be freed, then smaller chunks of that extent | ||
2505 | * allocated and freed again before the first transaction commit is on disk. | ||
2506 | * If the exact same start block is freed a second time, we have to wait for | ||
2507 | * that busy extent to pass out of the tree before the new extent is inserted. | ||
2508 | * There are two main cases we have to handle here. | ||
2509 | * | ||
2510 | * The first case is a transaction that triggers a "free - allocate - free" | ||
2511 | * cycle. This can occur during btree manipulations as a btree block is freed | ||
2512 | * to the freelist, then allocated from the free list, then freed again. In | ||
2513 | * this case, the second extent free is what triggers the duplicate and as | ||
2514 | * such the transaction IDs should match. Because the extent was allocated in | ||
2515 | * this transaction, the transaction must be marked as synchronous. This is | ||
2516 | * true for all cases where the free/alloc/free occurs in the one transaction, | ||
2517 | * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. | ||
2518 | * This serves to catch violations of the second case quite effectively. | ||
2519 | * | ||
2520 | * The second case is where the free/alloc/free occur in different | ||
2521 | * transactions. In this case, the thread freeing the extent the second time | ||
2522 | * can't mark the extent busy immediately because it is already tracked in a | ||
2523 | * transaction that may be committing. When the log commit for the existing | ||
2524 | * busy extent completes, the busy extent will be removed from the tree. If we | ||
2525 | * allow the second busy insert to continue using that busy extent structure, | ||
2526 | * it can be freed before this transaction is safely in the log. Hence our | ||
2527 | * only option in this case is to force the log to remove the existing busy | ||
2528 | * extent from the list before we insert the new one with the current | ||
2529 | * transaction ID. | ||
2530 | * | ||
2531 | * The problem we are trying to avoid in the free-alloc-free in separate | ||
2532 | * transactions is most easily described with a timeline: | ||
2533 | * | ||
2534 | * Thread 1 Thread 2 Thread 3 xfslogd | ||
2535 | * xact alloc | ||
2536 | * free X | ||
2537 | * mark busy | ||
2538 | * commit xact | ||
2539 | * free xact | ||
2540 | * xact alloc | ||
2541 | * alloc X | ||
2542 | * busy search | ||
2543 | * mark xact sync | ||
2544 | * commit xact | ||
2545 | * free xact | ||
2546 | * force log | ||
2547 | * checkpoint starts | ||
2548 | * .... | ||
2549 | * xact alloc | ||
2550 | * free X | ||
2551 | * mark busy | ||
2552 | * finds match | ||
2553 | * *** KABOOM! *** | ||
2554 | * .... | ||
2555 | * log IO completes | ||
2556 | * unbusy X | ||
2557 | * checkpoint completes | ||
2558 | * | ||
2559 | * By issuing a log force in thread 3 @ "KABOOM", the thread will block until | ||
2560 | * the checkpoint completes, and the busy extent it matched will have been | ||
2561 | * removed from the tree when it is woken. Hence it can then continue safely. | ||
2562 | * | ||
2563 | * However, to ensure this matching process is robust, we need to use the | ||
2564 | * transaction ID to identify the transaction, as delayed logging results in | ||
2565 | * the busy extent and transaction lifecycles being different, i.e. the busy | ||
2566 | * extent is active for a lot longer than the transaction. Hence the | ||
2567 | * transaction structure can be freed and reallocated, and then used to mark | ||
2568 | * the same extent busy again in the new transaction. In this case the new | ||
2569 | * transaction will have a different tid but can have the same address, and | ||
2570 | * hence we need to check against the tid. | ||
2571 | * | ||
2572 | * Future: for delayed logging, we could avoid the log force if the extent was | ||
2573 | * first freed in the current checkpoint sequence. This, however, requires the | ||
2574 | * ability to pin the current checkpoint in memory until this transaction | ||
2575 | * commits to ensure that both the original free and the current one combine | ||
2576 | * logically into the one checkpoint. If the checkpoint sequences are | ||
2577 | * different, however, we still need to wait on a log force. | ||
2484 | */ | 2578 | */ |
2485 | void | 2579 | void |
2486 | xfs_alloc_mark_busy(xfs_trans_t *tp, | 2580 | xfs_alloc_busy_insert( |
2487 | xfs_agnumber_t agno, | 2581 | struct xfs_trans *tp, |
2488 | xfs_agblock_t bno, | 2582 | xfs_agnumber_t agno, |
2489 | xfs_extlen_t len) | 2583 | xfs_agblock_t bno, |
2584 | xfs_extlen_t len) | ||
2490 | { | 2585 | { |
2491 | xfs_perag_busy_t *bsy; | 2586 | struct xfs_busy_extent *new; |
2587 | struct xfs_busy_extent *busyp; | ||
2492 | struct xfs_perag *pag; | 2588 | struct xfs_perag *pag; |
2493 | int n; | 2589 | struct rb_node **rbp; |
2590 | struct rb_node *parent; | ||
2591 | int match; | ||
2494 | 2592 | ||
2495 | pag = xfs_perag_get(tp->t_mountp, agno); | ||
2496 | spin_lock(&pag->pagb_lock); | ||
2497 | 2593 | ||
2498 | /* search pagb_list for an open slot */ | 2594 | new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); |
2499 | for (bsy = pag->pagb_list, n = 0; | 2595 | if (!new) { |
2500 | n < XFS_PAGB_NUM_SLOTS; | 2596 | /* |
2501 | bsy++, n++) { | 2597 | * No Memory! Since it is now not possible to track the free |
2502 | if (bsy->busy_tp == NULL) { | 2598 | * block, make this a synchronous transaction to ensure that |
2503 | break; | 2599 | * the block is not reused before this transaction commits. |
2504 | } | 2600 | */ |
2601 | trace_xfs_alloc_busy(tp, agno, bno, len, 1); | ||
2602 | xfs_trans_set_sync(tp); | ||
2603 | return; | ||
2505 | } | 2604 | } |
2506 | 2605 | ||
2507 | trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); | 2606 | new->agno = agno; |
2607 | new->bno = bno; | ||
2608 | new->length = len; | ||
2609 | new->tid = xfs_log_get_trans_ident(tp); | ||
2508 | 2610 | ||
2509 | if (n < XFS_PAGB_NUM_SLOTS) { | 2611 | INIT_LIST_HEAD(&new->list); |
2510 | bsy = &pag->pagb_list[n]; | 2612 | |
2511 | pag->pagb_count++; | 2613 | /* trace before insert to be able to see failed inserts */ |
2512 | bsy->busy_start = bno; | 2614 | trace_xfs_alloc_busy(tp, agno, bno, len, 0); |
2513 | bsy->busy_length = len; | 2615 | |
2514 | bsy->busy_tp = tp; | 2616 | pag = xfs_perag_get(tp->t_mountp, new->agno); |
2515 | xfs_trans_add_busy(tp, agno, n); | 2617 | restart: |
2516 | } else { | 2618 | spin_lock(&pag->pagb_lock); |
2619 | rbp = &pag->pagb_tree.rb_node; | ||
2620 | parent = NULL; | ||
2621 | busyp = NULL; | ||
2622 | match = 0; | ||
2623 | while (*rbp && match >= 0) { | ||
2624 | parent = *rbp; | ||
2625 | busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); | ||
2626 | |||
2627 | if (new->bno < busyp->bno) { | ||
2628 | /* may overlap, but exact start block is lower */ | ||
2629 | rbp = &(*rbp)->rb_left; | ||
2630 | if (new->bno + new->length > busyp->bno) | ||
2631 | match = busyp->tid == new->tid ? 1 : -1; | ||
2632 | } else if (new->bno > busyp->bno) { | ||
2633 | /* may overlap, but exact start block is higher */ | ||
2634 | rbp = &(*rbp)->rb_right; | ||
2635 | if (bno < busyp->bno + busyp->length) | ||
2636 | match = busyp->tid == new->tid ? 1 : -1; | ||
2637 | } else { | ||
2638 | match = busyp->tid == new->tid ? 1 : -1; | ||
2639 | break; | ||
2640 | } | ||
2641 | } | ||
2642 | if (match < 0) { | ||
2643 | /* overlap marked busy in different transaction */ | ||
2644 | spin_unlock(&pag->pagb_lock); | ||
2645 | xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); | ||
2646 | goto restart; | ||
2647 | } | ||
2648 | if (match > 0) { | ||
2517 | /* | 2649 | /* |
2518 | * The busy list is full! Since it is now not possible to | 2650 | * overlap marked busy in same transaction. Update if exact |
2519 | * track the free block, make this a synchronous transaction | 2651 | * start block match, otherwise combine the busy extents into |
2520 | * to insure that the block is not reused before this | 2652 | * a single range. |
2521 | * transaction commits. | ||
2522 | */ | 2653 | */ |
2523 | xfs_trans_set_sync(tp); | 2654 | if (busyp->bno == new->bno) { |
2524 | } | 2655 | busyp->length = max(busyp->length, new->length); |
2656 | spin_unlock(&pag->pagb_lock); | ||
2657 | ASSERT(tp->t_flags & XFS_TRANS_SYNC); | ||
2658 | xfs_perag_put(pag); | ||
2659 | kmem_free(new); | ||
2660 | return; | ||
2661 | } | ||
2662 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | ||
2663 | new->length = max(busyp->bno + busyp->length, | ||
2664 | new->bno + new->length) - | ||
2665 | min(busyp->bno, new->bno); | ||
2666 | new->bno = min(busyp->bno, new->bno); | ||
2667 | } else | ||
2668 | busyp = NULL; | ||
2525 | 2669 | ||
2670 | rb_link_node(&new->rb_node, parent, rbp); | ||
2671 | rb_insert_color(&new->rb_node, &pag->pagb_tree); | ||
2672 | |||
2673 | list_add(&new->list, &tp->t_busy); | ||
2526 | spin_unlock(&pag->pagb_lock); | 2674 | spin_unlock(&pag->pagb_lock); |
2527 | xfs_perag_put(pag); | 2675 | xfs_perag_put(pag); |
2676 | kmem_free(busyp); | ||
2528 | } | 2677 | } |
2529 | 2678 | ||
2530 | void | 2679 | /* |
2531 | xfs_alloc_clear_busy(xfs_trans_t *tp, | 2680 | * Search for a busy extent within the range of the extent we are about to |
2532 | xfs_agnumber_t agno, | 2681 | * allocate. xfs_alloc_busy_search() takes the busy extent tree lock |
2533 | int idx) | 2682 | * internally. This function returns 0 for no overlapping busy |
2683 | * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact | ||
2684 | * match. This is done so that a non-zero return indicates an overlap that | ||
2685 | * will require a synchronous transaction, but it can still be | ||
2686 | * used to distinguish between a partial and an exact match. | ||
2687 | */ | ||
2688 | static int | ||
2689 | xfs_alloc_busy_search( | ||
2690 | struct xfs_mount *mp, | ||
2691 | xfs_agnumber_t agno, | ||
2692 | xfs_agblock_t bno, | ||
2693 | xfs_extlen_t len) | ||
2534 | { | 2694 | { |
2535 | struct xfs_perag *pag; | 2695 | struct xfs_perag *pag; |
2536 | xfs_perag_busy_t *list; | 2696 | struct rb_node *rbp; |
2697 | struct xfs_busy_extent *busyp; | ||
2698 | int match = 0; | ||
2537 | 2699 | ||
2538 | ASSERT(idx < XFS_PAGB_NUM_SLOTS); | 2700 | pag = xfs_perag_get(mp, agno); |
2539 | pag = xfs_perag_get(tp->t_mountp, agno); | ||
2540 | spin_lock(&pag->pagb_lock); | 2701 | spin_lock(&pag->pagb_lock); |
2541 | list = pag->pagb_list; | ||
2542 | 2702 | ||
2543 | trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); | 2703 | rbp = pag->pagb_tree.rb_node; |
2544 | 2704 | ||
2545 | if (list[idx].busy_tp == tp) { | 2705 | /* find closest start bno overlap */ |
2546 | list[idx].busy_tp = NULL; | 2706 | while (rbp) { |
2547 | pag->pagb_count--; | 2707 | busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); |
2708 | if (bno < busyp->bno) { | ||
2709 | /* may overlap, but exact start block is lower */ | ||
2710 | if (bno + len > busyp->bno) | ||
2711 | match = -1; | ||
2712 | rbp = rbp->rb_left; | ||
2713 | } else if (bno > busyp->bno) { | ||
2714 | /* may overlap, but exact start block is higher */ | ||
2715 | if (bno < busyp->bno + busyp->length) | ||
2716 | match = -1; | ||
2717 | rbp = rbp->rb_right; | ||
2718 | } else { | ||
2719 | /* bno matches busyp, length determines exact match */ | ||
2720 | match = (busyp->length == len) ? 1 : -1; | ||
2721 | break; | ||
2722 | } | ||
2548 | } | 2723 | } |
2549 | |||
2550 | spin_unlock(&pag->pagb_lock); | 2724 | spin_unlock(&pag->pagb_lock); |
2725 | trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); | ||
2551 | xfs_perag_put(pag); | 2726 | xfs_perag_put(pag); |
2727 | return match; | ||
2552 | } | 2728 | } |
2553 | 2729 | ||
2554 | 2730 | void | |
2555 | /* | 2731 | xfs_alloc_busy_clear( |
2556 | * If we find the extent in the busy list, force the log out to get the | 2732 | struct xfs_mount *mp, |
2557 | * extent out of the busy list so the caller can use it straight away. | 2733 | struct xfs_busy_extent *busyp) |
2558 | */ | ||
2559 | STATIC void | ||
2560 | xfs_alloc_search_busy(xfs_trans_t *tp, | ||
2561 | xfs_agnumber_t agno, | ||
2562 | xfs_agblock_t bno, | ||
2563 | xfs_extlen_t len) | ||
2564 | { | 2734 | { |
2565 | struct xfs_perag *pag; | 2735 | struct xfs_perag *pag; |
2566 | xfs_perag_busy_t *bsy; | ||
2567 | xfs_agblock_t uend, bend; | ||
2568 | xfs_lsn_t lsn = 0; | ||
2569 | int cnt; | ||
2570 | 2736 | ||
2571 | pag = xfs_perag_get(tp->t_mountp, agno); | 2737 | trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, |
2572 | spin_lock(&pag->pagb_lock); | 2738 | busyp->length); |
2573 | cnt = pag->pagb_count; | ||
2574 | 2739 | ||
2575 | /* | 2740 | ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, |
2576 | * search pagb_list for this slot, skipping open slots. We have to | 2741 | busyp->length) == 1); |
2577 | * search the entire array as there may be multiple overlaps and | ||
2578 | * we have to get the most recent LSN for the log force to push out | ||
2579 | * all the transactions that span the range. | ||
2580 | */ | ||
2581 | uend = bno + len - 1; | ||
2582 | for (cnt = 0; cnt < pag->pagb_count; cnt++) { | ||
2583 | bsy = &pag->pagb_list[cnt]; | ||
2584 | if (!bsy->busy_tp) | ||
2585 | continue; | ||
2586 | 2742 | ||
2587 | bend = bsy->busy_start + bsy->busy_length - 1; | 2743 | list_del_init(&busyp->list); |
2588 | if (bno > bend || uend < bsy->busy_start) | ||
2589 | continue; | ||
2590 | 2744 | ||
2591 | /* (start1,length1) within (start2, length2) */ | 2745 | pag = xfs_perag_get(mp, busyp->agno); |
2592 | if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) | 2746 | spin_lock(&pag->pagb_lock); |
2593 | lsn = bsy->busy_tp->t_commit_lsn; | 2747 | rb_erase(&busyp->rb_node, &pag->pagb_tree); |
2594 | } | ||
2595 | spin_unlock(&pag->pagb_lock); | 2748 | spin_unlock(&pag->pagb_lock); |
2596 | xfs_perag_put(pag); | 2749 | xfs_perag_put(pag); |
2597 | trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); | ||
2598 | 2750 | ||
2599 | /* | 2751 | kmem_free(busyp); |
2600 | * If a block was found, force the log through the LSN of the | ||
2601 | * transaction that freed the block | ||
2602 | */ | ||
2603 | if (lsn) | ||
2604 | xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC); | ||
2605 | } | 2752 | } |
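The interval logic that xfs_alloc_busy_insert() and xfs_alloc_busy_search() both walk the rbtree with can be condensed into a small sketch. The helper below is hypothetical (it is not part of the patch) and only restates the half-open range test using the usual XFS typedefs:

	/*
	 * Illustration only: busy extents are half-open block ranges
	 * [bno, bno + length) within one AG.
	 */
	static inline int
	busy_overlap(xfs_agblock_t bno1, xfs_extlen_t len1,
		     xfs_agblock_t bno2, xfs_extlen_t len2)
	{
		return bno1 < bno2 + len2 && bno2 < bno1 + len1;
	}

As a worked example of the same-transaction merge in xfs_alloc_busy_insert(): with busyp covering [8, 12) and new covering [10, 16), the combined extent becomes bno = min(8, 10) = 8 and length = max(12, 16) - min(8, 10) = 8, i.e. the single range [8, 16).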
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 599bffa39784..6d05199b667c 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h | |||
@@ -22,6 +22,7 @@ struct xfs_buf; | |||
22 | struct xfs_mount; | 22 | struct xfs_mount; |
23 | struct xfs_perag; | 23 | struct xfs_perag; |
24 | struct xfs_trans; | 24 | struct xfs_trans; |
25 | struct xfs_busy_extent; | ||
25 | 26 | ||
26 | /* | 27 | /* |
27 | * Freespace allocation types. Argument to xfs_alloc_[v]extent. | 28 | * Freespace allocation types. Argument to xfs_alloc_[v]extent. |
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp, | |||
119 | #ifdef __KERNEL__ | 120 | #ifdef __KERNEL__ |
120 | 121 | ||
121 | void | 122 | void |
122 | xfs_alloc_mark_busy(xfs_trans_t *tp, | 123 | xfs_alloc_busy_insert(xfs_trans_t *tp, |
123 | xfs_agnumber_t agno, | 124 | xfs_agnumber_t agno, |
124 | xfs_agblock_t bno, | 125 | xfs_agblock_t bno, |
125 | xfs_extlen_t len); | 126 | xfs_extlen_t len); |
126 | 127 | ||
127 | void | 128 | void |
128 | xfs_alloc_clear_busy(xfs_trans_t *tp, | 129 | xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); |
129 | xfs_agnumber_t ag, | ||
130 | int idx); | ||
131 | 130 | ||
132 | #endif /* __KERNEL__ */ | 131 | #endif /* __KERNEL__ */ |
133 | 132 | ||
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index b726e10d2c1c..83f494218759 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c | |||
@@ -134,7 +134,7 @@ xfs_allocbt_free_block( | |||
134 | * disk. If a busy block is allocated, the iclog is pushed up to the | 134 | * disk. If a busy block is allocated, the iclog is pushed up to the |
135 | * LSN that freed the block. | 135 | * LSN that freed the block. |
136 | */ | 136 | */ |
137 | xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); | 137 | xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); |
138 | xfs_trans_agbtree_delta(cur->bc_tp, -1); | 138 | xfs_trans_agbtree_delta(cur->bc_tp, -1); |
139 | return 0; | 139 | return 0; |
140 | } | 140 | } |
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 240340a4727b..02a80984aa05 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug( | |||
64 | nbytes = last - first + 1; | 64 | nbytes = last - first + 1; |
65 | bfset(bip->bli_logged, first, nbytes); | 65 | bfset(bip->bli_logged, first, nbytes); |
66 | for (x = 0; x < nbytes; x++) { | 66 | for (x = 0; x < nbytes; x++) { |
67 | chunk_num = byte >> XFS_BLI_SHIFT; | 67 | chunk_num = byte >> XFS_BLF_SHIFT; |
68 | word_num = chunk_num >> BIT_TO_WORD_SHIFT; | 68 | word_num = chunk_num >> BIT_TO_WORD_SHIFT; |
69 | bit_num = chunk_num & (NBWORD - 1); | 69 | bit_num = chunk_num & (NBWORD - 1); |
70 | wordp = &(bip->bli_format.blf_data_map[word_num]); | 70 | wordp = &(bip->bli_format.blf_data_map[word_num]); |
@@ -166,7 +166,7 @@ xfs_buf_item_size( | |||
166 | * cancel flag in it. | 166 | * cancel flag in it. |
167 | */ | 167 | */ |
168 | trace_xfs_buf_item_size_stale(bip); | 168 | trace_xfs_buf_item_size_stale(bip); |
169 | ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); | 169 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); |
170 | return 1; | 170 | return 1; |
171 | } | 171 | } |
172 | 172 | ||
@@ -197,9 +197,9 @@ xfs_buf_item_size( | |||
197 | } else if (next_bit != last_bit + 1) { | 197 | } else if (next_bit != last_bit + 1) { |
198 | last_bit = next_bit; | 198 | last_bit = next_bit; |
199 | nvecs++; | 199 | nvecs++; |
200 | } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != | 200 | } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != |
201 | (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + | 201 | (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + |
202 | XFS_BLI_CHUNK)) { | 202 | XFS_BLF_CHUNK)) { |
203 | last_bit = next_bit; | 203 | last_bit = next_bit; |
204 | nvecs++; | 204 | nvecs++; |
205 | } else { | 205 | } else { |
@@ -254,6 +254,20 @@ xfs_buf_item_format( | |||
254 | vecp++; | 254 | vecp++; |
255 | nvecs = 1; | 255 | nvecs = 1; |
256 | 256 | ||
257 | /* | ||
258 | * If it is an inode buffer, transfer the in-memory state to the | ||
259 | * format flags and clear the in-memory state. We do not transfer | ||
260 | * this state if the inode buffer allocation has not yet been committed | ||
261 | * to the log as setting the XFS_BLF_INODE_BUF flag will prevent | ||
262 | * correct replay of the inode allocation. | ||
263 | */ | ||
264 | if (bip->bli_flags & XFS_BLI_INODE_BUF) { | ||
265 | if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && | ||
266 | xfs_log_item_in_current_chkpt(&bip->bli_item))) | ||
267 | bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; | ||
268 | bip->bli_flags &= ~XFS_BLI_INODE_BUF; | ||
269 | } | ||
270 | |||
257 | if (bip->bli_flags & XFS_BLI_STALE) { | 271 | if (bip->bli_flags & XFS_BLI_STALE) { |
258 | /* | 272 | /* |
259 | * The buffer is stale, so all we need to log | 273 | * The buffer is stale, so all we need to log |
@@ -261,7 +275,7 @@ xfs_buf_item_format( | |||
261 | * cancel flag in it. | 275 | * cancel flag in it. |
262 | */ | 276 | */ |
263 | trace_xfs_buf_item_format_stale(bip); | 277 | trace_xfs_buf_item_format_stale(bip); |
264 | ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); | 278 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); |
265 | bip->bli_format.blf_size = nvecs; | 279 | bip->bli_format.blf_size = nvecs; |
266 | return; | 280 | return; |
267 | } | 281 | } |
@@ -294,28 +308,28 @@ xfs_buf_item_format( | |||
294 | * keep counting and scanning. | 308 | * keep counting and scanning. |
295 | */ | 309 | */ |
296 | if (next_bit == -1) { | 310 | if (next_bit == -1) { |
297 | buffer_offset = first_bit * XFS_BLI_CHUNK; | 311 | buffer_offset = first_bit * XFS_BLF_CHUNK; |
298 | vecp->i_addr = xfs_buf_offset(bp, buffer_offset); | 312 | vecp->i_addr = xfs_buf_offset(bp, buffer_offset); |
299 | vecp->i_len = nbits * XFS_BLI_CHUNK; | 313 | vecp->i_len = nbits * XFS_BLF_CHUNK; |
300 | vecp->i_type = XLOG_REG_TYPE_BCHUNK; | 314 | vecp->i_type = XLOG_REG_TYPE_BCHUNK; |
301 | nvecs++; | 315 | nvecs++; |
302 | break; | 316 | break; |
303 | } else if (next_bit != last_bit + 1) { | 317 | } else if (next_bit != last_bit + 1) { |
304 | buffer_offset = first_bit * XFS_BLI_CHUNK; | 318 | buffer_offset = first_bit * XFS_BLF_CHUNK; |
305 | vecp->i_addr = xfs_buf_offset(bp, buffer_offset); | 319 | vecp->i_addr = xfs_buf_offset(bp, buffer_offset); |
306 | vecp->i_len = nbits * XFS_BLI_CHUNK; | 320 | vecp->i_len = nbits * XFS_BLF_CHUNK; |
307 | vecp->i_type = XLOG_REG_TYPE_BCHUNK; | 321 | vecp->i_type = XLOG_REG_TYPE_BCHUNK; |
308 | nvecs++; | 322 | nvecs++; |
309 | vecp++; | 323 | vecp++; |
310 | first_bit = next_bit; | 324 | first_bit = next_bit; |
311 | last_bit = next_bit; | 325 | last_bit = next_bit; |
312 | nbits = 1; | 326 | nbits = 1; |
313 | } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != | 327 | } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) != |
314 | (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + | 328 | (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) + |
315 | XFS_BLI_CHUNK)) { | 329 | XFS_BLF_CHUNK)) { |
316 | buffer_offset = first_bit * XFS_BLI_CHUNK; | 330 | buffer_offset = first_bit * XFS_BLF_CHUNK; |
317 | vecp->i_addr = xfs_buf_offset(bp, buffer_offset); | 331 | vecp->i_addr = xfs_buf_offset(bp, buffer_offset); |
318 | vecp->i_len = nbits * XFS_BLI_CHUNK; | 332 | vecp->i_len = nbits * XFS_BLF_CHUNK; |
319 | vecp->i_type = XLOG_REG_TYPE_BCHUNK; | 333 | vecp->i_type = XLOG_REG_TYPE_BCHUNK; |
320 | /* You would think we need to bump the nvecs here too, but we do not: | 334 | /* You would think we need to bump the nvecs here too, but we do not: |
321 | * this number is used by recovery, and it gets confused by the boundary | 335 | * this number is used by recovery, and it gets confused by the boundary |
@@ -341,10 +355,15 @@ xfs_buf_item_format( | |||
341 | } | 355 | } |
342 | 356 | ||
343 | /* | 357 | /* |
344 | * This is called to pin the buffer associated with the buf log | 358 | * This is called to pin the buffer associated with the buf log item in memory |
345 | * item in memory so it cannot be written out. Simply call bpin() | 359 | * so it cannot be written out. Simply call bpin() on the buffer to do this. |
346 | * on the buffer to do this. | 360 | * |
361 | * We also always take a reference to the buffer log item here so that the bli | ||
362 | * is held while the item is pinned in memory. This means that we can | ||
363 | * unconditionally drop the reference count a transaction holds when the | ||
364 | * transaction is completed. | ||
347 | */ | 365 | */ |
366 | |||
348 | STATIC void | 367 | STATIC void |
349 | xfs_buf_item_pin( | 368 | xfs_buf_item_pin( |
350 | xfs_buf_log_item_t *bip) | 369 | xfs_buf_log_item_t *bip) |
@@ -356,6 +375,7 @@ xfs_buf_item_pin( | |||
356 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 375 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
357 | ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || | 376 | ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || |
358 | (bip->bli_flags & XFS_BLI_STALE)); | 377 | (bip->bli_flags & XFS_BLI_STALE)); |
378 | atomic_inc(&bip->bli_refcount); | ||
359 | trace_xfs_buf_item_pin(bip); | 379 | trace_xfs_buf_item_pin(bip); |
360 | xfs_bpin(bp); | 380 | xfs_bpin(bp); |
361 | } | 381 | } |
@@ -393,7 +413,7 @@ xfs_buf_item_unpin( | |||
393 | ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); | 413 | ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); |
394 | ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); | 414 | ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); |
395 | ASSERT(XFS_BUF_ISSTALE(bp)); | 415 | ASSERT(XFS_BUF_ISSTALE(bp)); |
396 | ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); | 416 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); |
397 | trace_xfs_buf_item_unpin_stale(bip); | 417 | trace_xfs_buf_item_unpin_stale(bip); |
398 | 418 | ||
399 | /* | 419 | /* |
@@ -489,20 +509,23 @@ xfs_buf_item_trylock( | |||
489 | } | 509 | } |
490 | 510 | ||
491 | /* | 511 | /* |
492 | * Release the buffer associated with the buf log item. | 512 | * Release the buffer associated with the buf log item. If there is no dirty |
493 | * If there is no dirty logged data associated with the | 513 | * logged data associated with the buffer recorded in the buf log item, then |
494 | * buffer recorded in the buf log item, then free the | 514 | * free the buf log item and remove the reference to it in the buffer. |
495 | * buf log item and remove the reference to it in the | 515 | * |
496 | * buffer. | 516 | * This call ignores the recursion count. It is only called when the buffer |
517 | * should REALLY be unlocked, regardless of the recursion count. | ||
497 | * | 518 | * |
498 | * This call ignores the recursion count. It is only called | 519 | * We unconditionally drop the transaction's reference to the log item. If the |
499 | * when the buffer should REALLY be unlocked, regardless | 520 | * item was logged, then another reference was taken when it was pinned, so we |
500 | * of the recursion count. | 521 | * can safely drop the transaction reference now. This also allows us to avoid |
522 | * potential races with the unpin code freeing the bli by not referencing the | ||
523 | * bli after we've dropped the reference count. | ||
501 | * | 524 | * |
502 | * If the XFS_BLI_HOLD flag is set in the buf log item, then | 525 | * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item |
503 | * free the log item if necessary but do not unlock the buffer. | 526 | * if necessary but do not unlock the buffer. This is for support of |
504 | * This is for support of xfs_trans_bhold(). Make sure the | 527 | * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't |
505 | * XFS_BLI_HOLD field is cleared if we don't free the item. | 528 | * free the item. |
506 | */ | 529 | */ |
507 | STATIC void | 530 | STATIC void |
508 | xfs_buf_item_unlock( | 531 | xfs_buf_item_unlock( |
@@ -514,73 +537,54 @@ xfs_buf_item_unlock( | |||
514 | 537 | ||
515 | bp = bip->bli_buf; | 538 | bp = bip->bli_buf; |
516 | 539 | ||
517 | /* | 540 | /* Clear the buffer's association with this transaction. */ |
518 | * Clear the buffer's association with this transaction. | ||
519 | */ | ||
520 | XFS_BUF_SET_FSPRIVATE2(bp, NULL); | 541 | XFS_BUF_SET_FSPRIVATE2(bp, NULL); |
521 | 542 | ||
522 | /* | 543 | /* |
523 | * If this is a transaction abort, don't return early. | 544 | * If this is a transaction abort, don't return early. Instead, allow |
524 | * Instead, allow the brelse to happen. | 545 | * the brelse to happen. Normally it would be done for stale |
525 | * Normally it would be done for stale (cancelled) buffers | 546 | * (cancelled) buffers at unpin time, but we'll never go through the |
526 | * at unpin time, but we'll never go through the pin/unpin | 547 | * pin/unpin cycle if we abort inside commit. |
527 | * cycle if we abort inside commit. | ||
528 | */ | 548 | */ |
529 | aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; | 549 | aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; |
530 | 550 | ||
531 | /* | 551 | /* |
532 | * If the buf item is marked stale, then don't do anything. | 552 | * Before possibly freeing the buf item, determine if we should |
533 | * We'll unlock the buffer and free the buf item when the | 553 | * release the buffer at the end of this routine. |
534 | * buffer is unpinned for the last time. | ||
535 | */ | 554 | */ |
536 | if (bip->bli_flags & XFS_BLI_STALE) { | 555 | hold = bip->bli_flags & XFS_BLI_HOLD; |
537 | bip->bli_flags &= ~XFS_BLI_LOGGED; | 556 | |
538 | trace_xfs_buf_item_unlock_stale(bip); | 557 | /* Clear the per transaction state. */ |
539 | ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); | 558 | bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD); |
540 | if (!aborted) | ||
541 | return; | ||
542 | } | ||
543 | 559 | ||
544 | /* | 560 | /* |
545 | * Drop the transaction's reference to the log item if | 561 | * If the buf item is marked stale, then don't do anything. We'll |
546 | * it was not logged as part of the transaction. Otherwise | 562 | * unlock the buffer and free the buf item when the buffer is unpinned |
547 | * we'll drop the reference in xfs_buf_item_unpin() when | 563 | * for the last time. |
548 | * the transaction is really through with the buffer. | ||
549 | */ | 564 | */ |
550 | if (!(bip->bli_flags & XFS_BLI_LOGGED)) { | 565 | if (bip->bli_flags & XFS_BLI_STALE) { |
551 | atomic_dec(&bip->bli_refcount); | 566 | trace_xfs_buf_item_unlock_stale(bip); |
552 | } else { | 567 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); |
553 | /* | 568 | if (!aborted) { |
554 | * Clear the logged flag since this is per | 569 | atomic_dec(&bip->bli_refcount); |
555 | * transaction state. | 570 | return; |
556 | */ | 571 | } |
557 | bip->bli_flags &= ~XFS_BLI_LOGGED; | ||
558 | } | 572 | } |
559 | 573 | ||
560 | /* | ||
561 | * Before possibly freeing the buf item, determine if we should | ||
562 | * release the buffer at the end of this routine. | ||
563 | */ | ||
564 | hold = bip->bli_flags & XFS_BLI_HOLD; | ||
565 | trace_xfs_buf_item_unlock(bip); | 574 | trace_xfs_buf_item_unlock(bip); |
566 | 575 | ||
567 | /* | 576 | /* |
568 | * If the buf item isn't tracking any data, free it. | 577 | * If the buf item isn't tracking any data, free it, otherwise drop the |
569 | * Otherwise, if XFS_BLI_HOLD is set clear it. | 578 | * reference we hold to it. |
570 | */ | 579 | */ |
571 | if (xfs_bitmap_empty(bip->bli_format.blf_data_map, | 580 | if (xfs_bitmap_empty(bip->bli_format.blf_data_map, |
572 | bip->bli_format.blf_map_size)) { | 581 | bip->bli_format.blf_map_size)) |
573 | xfs_buf_item_relse(bp); | 582 | xfs_buf_item_relse(bp); |
574 | } else if (hold) { | 583 | else |
575 | bip->bli_flags &= ~XFS_BLI_HOLD; | 584 | atomic_dec(&bip->bli_refcount); |
576 | } | ||
577 | 585 | ||
578 | /* | 586 | if (!hold) |
579 | * Release the buffer if XFS_BLI_HOLD was not set. | ||
580 | */ | ||
581 | if (!hold) { | ||
582 | xfs_buf_relse(bp); | 587 | xfs_buf_relse(bp); |
583 | } | ||
584 | } | 588 | } |
585 | 589 | ||
586 | /* | 590 | /* |
@@ -717,12 +721,12 @@ xfs_buf_item_init( | |||
717 | } | 721 | } |
718 | 722 | ||
719 | /* | 723 | /* |
720 | * chunks is the number of XFS_BLI_CHUNK size pieces | 724 | * chunks is the number of XFS_BLF_CHUNK size pieces |
721 | * the buffer can be divided into. Make sure not to | 725 | * the buffer can be divided into. Make sure not to |
722 | * truncate any pieces. map_size is the size of the | 726 | * truncate any pieces. map_size is the size of the |
723 | * bitmap needed to describe the chunks of the buffer. | 727 | * bitmap needed to describe the chunks of the buffer. |
724 | */ | 728 | */ |
725 | chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); | 729 | chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT); |
726 | map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); | 730 | map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); |
727 | 731 | ||
728 | bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, | 732 | bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, |
@@ -790,8 +794,8 @@ xfs_buf_item_log( | |||
790 | /* | 794 | /* |
791 | * Convert byte offsets to bit numbers. | 795 | * Convert byte offsets to bit numbers. |
792 | */ | 796 | */ |
793 | first_bit = first >> XFS_BLI_SHIFT; | 797 | first_bit = first >> XFS_BLF_SHIFT; |
794 | last_bit = last >> XFS_BLI_SHIFT; | 798 | last_bit = last >> XFS_BLF_SHIFT; |
795 | 799 | ||
796 | /* | 800 | /* |
797 | * Calculate the total number of bits to be set. | 801 | * Calculate the total number of bits to be set. |
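For a concrete feel for the XFS_BLF_CHUNK arithmetic in xfs_buf_item_init() above, here is a standalone sketch; the constants are copied from the xfs_buf_item.h hunk below and the buffer size is an arbitrary example:

	#include <stdio.h>

	#define XFS_BLF_CHUNK		128
	#define XFS_BLF_SHIFT		7
	#define BIT_TO_WORD_SHIFT	5
	#define NBWORD			32	/* NBBY * sizeof(unsigned int) */

	int main(void)
	{
		int count = 4096;	/* illustrative XFS_BUF_COUNT(bp) */
		int chunks = (count + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT;
		int map_size = (chunks + NBWORD) >> BIT_TO_WORD_SHIFT;

		/* 4096 bytes -> 32 chunks of 128 bytes -> 2 bitmap words */
		printf("chunks=%d map_size=%d\n", chunks, map_size);
		return 0;
	}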
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index df4454511f73..f20bb472d582 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h | |||
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format { | |||
41 | * This flag indicates that the buffer contains on disk inodes | 41 | * This flag indicates that the buffer contains on disk inodes |
42 | * and requires special recovery handling. | 42 | * and requires special recovery handling. |
43 | */ | 43 | */ |
44 | #define XFS_BLI_INODE_BUF 0x1 | 44 | #define XFS_BLF_INODE_BUF 0x1 |
45 | /* | 45 | /* |
46 | * This flag indicates that the buffer should not be replayed | 46 | * This flag indicates that the buffer should not be replayed |
47 | * during recovery because its blocks are being freed. | 47 | * during recovery because its blocks are being freed. |
48 | */ | 48 | */ |
49 | #define XFS_BLI_CANCEL 0x2 | 49 | #define XFS_BLF_CANCEL 0x2 |
50 | /* | 50 | /* |
51 | * This flag indicates that the buffer contains on disk | 51 | * This flag indicates that the buffer contains on disk |
52 | * user or group dquots and may require special recovery handling. | 52 | * user or group dquots and may require special recovery handling. |
53 | */ | 53 | */ |
54 | #define XFS_BLI_UDQUOT_BUF 0x4 | 54 | #define XFS_BLF_UDQUOT_BUF 0x4 |
55 | #define XFS_BLI_PDQUOT_BUF 0x8 | 55 | #define XFS_BLF_PDQUOT_BUF 0x8 |
56 | #define XFS_BLI_GDQUOT_BUF 0x10 | 56 | #define XFS_BLF_GDQUOT_BUF 0x10 |
57 | 57 | ||
58 | #define XFS_BLI_CHUNK 128 | 58 | #define XFS_BLF_CHUNK 128 |
59 | #define XFS_BLI_SHIFT 7 | 59 | #define XFS_BLF_SHIFT 7 |
60 | #define BIT_TO_WORD_SHIFT 5 | 60 | #define BIT_TO_WORD_SHIFT 5 |
61 | #define NBWORD (NBBY * sizeof(unsigned int)) | 61 | #define NBWORD (NBBY * sizeof(unsigned int)) |
62 | 62 | ||
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format { | |||
69 | #define XFS_BLI_LOGGED 0x08 | 69 | #define XFS_BLI_LOGGED 0x08 |
70 | #define XFS_BLI_INODE_ALLOC_BUF 0x10 | 70 | #define XFS_BLI_INODE_ALLOC_BUF 0x10 |
71 | #define XFS_BLI_STALE_INODE 0x20 | 71 | #define XFS_BLI_STALE_INODE 0x20 |
72 | #define XFS_BLI_INODE_BUF 0x40 | ||
72 | 73 | ||
73 | #define XFS_BLI_FLAGS \ | 74 | #define XFS_BLI_FLAGS \ |
74 | { XFS_BLI_HOLD, "HOLD" }, \ | 75 | { XFS_BLI_HOLD, "HOLD" }, \ |
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format { | |||
76 | { XFS_BLI_STALE, "STALE" }, \ | 77 | { XFS_BLI_STALE, "STALE" }, \ |
77 | { XFS_BLI_LOGGED, "LOGGED" }, \ | 78 | { XFS_BLI_LOGGED, "LOGGED" }, \ |
78 | { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ | 79 | { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ |
79 | { XFS_BLI_STALE_INODE, "STALE_INODE" } | 80 | { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ |
81 | { XFS_BLI_INODE_BUF, "INODE_BUF" } | ||
80 | 82 | ||
81 | 83 | ||
82 | #ifdef __KERNEL__ | 84 | #ifdef __KERNEL__ |
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ef96175c0744..047b8a8e5c29 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c | |||
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) | |||
170 | va_list ap; | 170 | va_list ap; |
171 | 171 | ||
172 | #ifdef DEBUG | 172 | #ifdef DEBUG |
173 | xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; | 173 | xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); |
174 | #endif | 174 | #endif |
175 | 175 | ||
176 | if (xfs_panic_mask && (xfs_panic_mask & panic_tag) | 176 | if (xfs_panic_mask && (xfs_panic_mask & panic_tag) |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3038dd52c72a..5215abc8023a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
@@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, | |||
54 | STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); | 54 | STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); |
55 | STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); | 55 | STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); |
56 | STATIC void xlog_dealloc_log(xlog_t *log); | 56 | STATIC void xlog_dealloc_log(xlog_t *log); |
57 | STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector, | ||
58 | struct xlog_ticket *tic, xfs_lsn_t *start_lsn, | ||
59 | xlog_in_core_t **commit_iclog, uint flags); | ||
60 | 57 | ||
61 | /* local state machine functions */ | 58 | /* local state machine functions */ |
62 | STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); | 59 | STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); |
@@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log, | |||
86 | STATIC void xlog_ungrant_log_space(xlog_t *log, | 83 | STATIC void xlog_ungrant_log_space(xlog_t *log, |
87 | xlog_ticket_t *ticket); | 84 | xlog_ticket_t *ticket); |
88 | 85 | ||
89 | |||
90 | /* local ticket functions */ | ||
91 | STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log, | ||
92 | int unit_bytes, | ||
93 | int count, | ||
94 | char clientid, | ||
95 | uint flags); | ||
96 | |||
97 | #if defined(DEBUG) | 86 | #if defined(DEBUG) |
98 | STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); | 87 | STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); |
99 | STATIC void xlog_verify_grant_head(xlog_t *log, int equals); | 88 | STATIC void xlog_verify_grant_head(xlog_t *log, int equals); |
@@ -360,6 +349,15 @@ xfs_log_reserve( | |||
360 | ASSERT(flags & XFS_LOG_PERM_RESERV); | 349 | ASSERT(flags & XFS_LOG_PERM_RESERV); |
361 | internal_ticket = *ticket; | 350 | internal_ticket = *ticket; |
362 | 351 | ||
352 | /* | ||
353 | * this is a new transaction on the ticket, so we need to | ||
354 | * change the transaction ID so that the next transaction has a | ||
355 | * different TID in the log. Just add one to the existing tid | ||
356 | * so that we can see chains of rolling transactions in the log | ||
357 | * easily. | ||
358 | */ | ||
359 | internal_ticket->t_tid++; | ||
360 | |||
363 | trace_xfs_log_reserve(log, internal_ticket); | 361 | trace_xfs_log_reserve(log, internal_ticket); |
364 | 362 | ||
365 | xlog_grant_push_ail(mp, internal_ticket->t_unit_res); | 363 | xlog_grant_push_ail(mp, internal_ticket->t_unit_res); |
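To see where this path is hit, consider the usual pattern for rolling a transaction on a permanent reservation (a rough sketch assuming the xfs_trans_dup()/xfs_trans_reserve() interfaces of this tree; error handling omitted):

	ntp = xfs_trans_dup(tp);		/* ticket moves to the new transaction */
	error = xfs_trans_commit(tp, 0);	/* permanent reservation is retained */
	tp = ntp;
	error = xfs_trans_reserve(tp, 0, log_res, 0,
				  XFS_TRANS_PERM_LOG_RES, log_count);

Each xfs_trans_reserve() call re-enters xfs_log_reserve() with the same permanent ticket, so the t_tid increment above gives every transaction in the chain a distinct TID in the log.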
@@ -367,7 +365,8 @@ xfs_log_reserve( | |||
367 | } else { | 365 | } else { |
368 | /* may sleep if need to allocate more tickets */ | 366 | /* may sleep if need to allocate more tickets */ |
369 | internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, | 367 | internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, |
370 | client, flags); | 368 | client, flags, |
369 | KM_SLEEP|KM_MAYFAIL); | ||
371 | if (!internal_ticket) | 370 | if (!internal_ticket) |
372 | return XFS_ERROR(ENOMEM); | 371 | return XFS_ERROR(ENOMEM); |
373 | internal_ticket->t_trans_type = t_type; | 372 | internal_ticket->t_trans_type = t_type; |
@@ -452,6 +451,13 @@ xfs_log_mount( | |||
452 | /* Normal transactions can now occur */ | 451 | /* Normal transactions can now occur */ |
453 | mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; | 452 | mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; |
454 | 453 | ||
454 | /* | ||
455 | * Now the log has been fully initialised and we know where our | ||
456 | * space grant counters are, we can initialise the permanent ticket | ||
457 | * needed for delayed logging to work. | ||
458 | */ | ||
459 | xlog_cil_init_post_recovery(mp->m_log); | ||
460 | |||
455 | return 0; | 461 | return 0; |
456 | 462 | ||
457 | out_destroy_ail: | 463 | out_destroy_ail: |
@@ -658,6 +664,10 @@ xfs_log_item_init( | |||
658 | item->li_ailp = mp->m_ail; | 664 | item->li_ailp = mp->m_ail; |
659 | item->li_type = type; | 665 | item->li_type = type; |
660 | item->li_ops = ops; | 666 | item->li_ops = ops; |
667 | item->li_lv = NULL; | ||
668 | |||
669 | INIT_LIST_HEAD(&item->li_ail); | ||
670 | INIT_LIST_HEAD(&item->li_cil); | ||
661 | } | 671 | } |
662 | 672 | ||
663 | /* | 673 | /* |
@@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp, | |||
1168 | *iclogp = log->l_iclog; /* complete ring */ | 1178 | *iclogp = log->l_iclog; /* complete ring */ |
1169 | log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ | 1179 | log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ |
1170 | 1180 | ||
1181 | error = xlog_cil_init(log); | ||
1182 | if (error) | ||
1183 | goto out_free_iclog; | ||
1171 | return log; | 1184 | return log; |
1172 | 1185 | ||
1173 | out_free_iclog: | 1186 | out_free_iclog: |
@@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log) | |||
1494 | xlog_in_core_t *iclog, *next_iclog; | 1507 | xlog_in_core_t *iclog, *next_iclog; |
1495 | int i; | 1508 | int i; |
1496 | 1509 | ||
1510 | xlog_cil_destroy(log); | ||
1511 | |||
1497 | iclog = log->l_iclog; | 1512 | iclog = log->l_iclog; |
1498 | for (i=0; i<log->l_iclog_bufs; i++) { | 1513 | for (i=0; i<log->l_iclog_bufs; i++) { |
1499 | sv_destroy(&iclog->ic_force_wait); | 1514 | sv_destroy(&iclog->ic_force_wait); |
@@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log, | |||
1536 | * print out info relating to regions written which consume | 1551 | * print out info relating to regions written which consume |
1537 | * the reservation | 1552 | * the reservation |
1538 | */ | 1553 | */ |
1539 | STATIC void | 1554 | void |
1540 | xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) | 1555 | xlog_print_tic_res( |
1556 | struct xfs_mount *mp, | ||
1557 | struct xlog_ticket *ticket) | ||
1541 | { | 1558 | { |
1542 | uint i; | 1559 | uint i; |
1543 | uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); | 1560 | uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); |
@@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) | |||
1637 | "bad-rtype" : res_type_str[r_type-1]), | 1654 | "bad-rtype" : res_type_str[r_type-1]), |
1638 | ticket->t_res_arr[i].r_len); | 1655 | ticket->t_res_arr[i].r_len); |
1639 | } | 1656 | } |
1657 | |||
1658 | xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, | ||
1659 | "xfs_log_write: reservation ran out. Need to up reservation"); | ||
1660 | xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); | ||
1640 | } | 1661 | } |
1641 | 1662 | ||
1642 | /* | 1663 | /* |
@@ -1865,7 +1886,7 @@ xlog_write_copy_finish( | |||
1865 | * we don't update ic_offset until the end when we know exactly how many | 1886 | * we don't update ic_offset until the end when we know exactly how many |
1866 | * bytes have been written out. | 1887 | * bytes have been written out. |
1867 | */ | 1888 | */ |
1868 | STATIC int | 1889 | int |
1869 | xlog_write( | 1890 | xlog_write( |
1870 | struct log *log, | 1891 | struct log *log, |
1871 | struct xfs_log_vec *log_vector, | 1892 | struct xfs_log_vec *log_vector, |
@@ -1889,22 +1910,26 @@ xlog_write( | |||
1889 | *start_lsn = 0; | 1910 | *start_lsn = 0; |
1890 | 1911 | ||
1891 | len = xlog_write_calc_vec_length(ticket, log_vector); | 1912 | len = xlog_write_calc_vec_length(ticket, log_vector); |
1892 | if (ticket->t_curr_res < len) { | 1913 | if (log->l_cilp) { |
1893 | xlog_print_tic_res(log->l_mp, ticket); | 1914 | /* |
1894 | #ifdef DEBUG | 1915 | * Region headers and bytes are already accounted for. |
1895 | xlog_panic( | 1916 | * We only need to take into account start records and |
1896 | "xfs_log_write: reservation ran out. Need to up reservation"); | 1917 | * split regions in this function. |
1897 | #else | 1918 | */ |
1898 | /* Customer configurable panic */ | 1919 | if (ticket->t_flags & XLOG_TIC_INITED) |
1899 | xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp, | 1920 | ticket->t_curr_res -= sizeof(xlog_op_header_t); |
1900 | "xfs_log_write: reservation ran out. Need to up reservation"); | ||
1901 | 1921 | ||
1902 | /* If we did not panic, shutdown the filesystem */ | 1922 | /* |
1903 | xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE); | 1923 | * Commit record headers need to be accounted for. These |
1904 | #endif | 1924 | * come in as separate writes so are easy to detect. |
1905 | } | 1925 | */ |
1926 | if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) | ||
1927 | ticket->t_curr_res -= sizeof(xlog_op_header_t); | ||
1928 | } else | ||
1929 | ticket->t_curr_res -= len; | ||
1906 | 1930 | ||
1907 | ticket->t_curr_res -= len; | 1931 | if (ticket->t_curr_res < 0) |
1932 | xlog_print_tic_res(log->l_mp, ticket); | ||
1908 | 1933 | ||
1909 | index = 0; | 1934 | index = 0; |
1910 | lv = log_vector; | 1935 | lv = log_vector; |
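
The accounting split above is worth spelling out: with delayed logging, the region bytes were already charged against the ticket when the items entered the CIL, so xlog_write() now pays only for the start record and, when a commit or unmount record arrives as a separate write, one more op header. A minimal userspace sketch of the two paths follows; OPHDR_SIZE is an illustrative stand-in for sizeof(xlog_op_header_t), not the real value.

#include <stdio.h>

#define OPHDR_SIZE 12	/* stand-in for sizeof(xlog_op_header_t) */

/* Model of the reservation xlog_write() itself consumes. */
static int xlog_write_res_used(int delayed, int len, int tic_inited,
			       int commit_or_unmount)
{
	int used = 0;

	if (delayed) {
		if (tic_inited)		/* start record op header */
			used += OPHDR_SIZE;
		if (commit_or_unmount)	/* commit record op header */
			used += OPHDR_SIZE;
	} else {
		used = len;		/* classic path charges everything here */
	}
	return used;
}

int main(void)
{
	printf("classic: %d\n", xlog_write_res_used(0, 4096, 1, 0));
	printf("delayed: %d\n", xlog_write_res_used(1, 4096, 1, 0));
	return 0;
}
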
@@ -3000,6 +3025,8 @@ _xfs_log_force( | |||
3000 | 3025 | ||
3001 | XFS_STATS_INC(xs_log_force); | 3026 | XFS_STATS_INC(xs_log_force); |
3002 | 3027 | ||
3028 | xlog_cil_push(log, 1); | ||
3029 | |||
3003 | spin_lock(&log->l_icloglock); | 3030 | spin_lock(&log->l_icloglock); |
3004 | 3031 | ||
3005 | iclog = log->l_iclog; | 3032 | iclog = log->l_iclog; |
@@ -3149,6 +3176,12 @@ _xfs_log_force_lsn( | |||
3149 | 3176 | ||
3150 | XFS_STATS_INC(xs_log_force); | 3177 | XFS_STATS_INC(xs_log_force); |
3151 | 3178 | ||
3179 | if (log->l_cilp) { | ||
3180 | lsn = xlog_cil_push_lsn(log, lsn); | ||
3181 | if (lsn == NULLCOMMITLSN) | ||
3182 | return 0; | ||
3183 | } | ||
3184 | |||
3152 | try_again: | 3185 | try_again: |
3153 | spin_lock(&log->l_icloglock); | 3186 | spin_lock(&log->l_icloglock); |
3154 | iclog = log->l_iclog; | 3187 | iclog = log->l_iclog; |
@@ -3313,22 +3346,30 @@ xfs_log_ticket_get( | |||
3313 | return ticket; | 3346 | return ticket; |
3314 | } | 3347 | } |
3315 | 3348 | ||
3349 | xlog_tid_t | ||
3350 | xfs_log_get_trans_ident( | ||
3351 | struct xfs_trans *tp) | ||
3352 | { | ||
3353 | return tp->t_ticket->t_tid; | ||
3354 | } | ||
3355 | |||
3316 | /* | 3356 | /* |
3317 | * Allocate and initialise a new log ticket. | 3357 | * Allocate and initialise a new log ticket. |
3318 | */ | 3358 | */ |
3319 | STATIC xlog_ticket_t * | 3359 | xlog_ticket_t * |
3320 | xlog_ticket_alloc( | 3360 | xlog_ticket_alloc( |
3321 | struct log *log, | 3361 | struct log *log, |
3322 | int unit_bytes, | 3362 | int unit_bytes, |
3323 | int cnt, | 3363 | int cnt, |
3324 | char client, | 3364 | char client, |
3325 | uint xflags) | 3365 | uint xflags, |
3366 | int alloc_flags) | ||
3326 | { | 3367 | { |
3327 | struct xlog_ticket *tic; | 3368 | struct xlog_ticket *tic; |
3328 | uint num_headers; | 3369 | uint num_headers; |
3329 | int iclog_space; | 3370 | int iclog_space; |
3330 | 3371 | ||
3331 | tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); | 3372 | tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); |
3332 | if (!tic) | 3373 | if (!tic) |
3333 | return NULL; | 3374 | return NULL; |
3334 | 3375 | ||
@@ -3647,6 +3688,11 @@ xlog_state_ioerror( | |||
3647 | * c. nothing new gets queued up after (a) and (b) are done. | 3688 | * c. nothing new gets queued up after (a) and (b) are done. |
3648 | * d. if !logerror, flush the iclogs to disk, then seal them off | 3689 | * d. if !logerror, flush the iclogs to disk, then seal them off |
3649 | * for business. | 3690 | * for business. |
3691 | * | ||
3692 | * Note: for delayed logging the !logerror case needs to flush the regions | ||
3693 | * held in memory out to the iclogs before flushing them to disk. This needs | ||
3694 | * to be done before the log is marked as shutdown, otherwise the flush to the | ||
3695 | * iclogs will fail. | ||
3650 | */ | 3696 | */ |
3651 | int | 3697 | int |
3652 | xfs_log_force_umount( | 3698 | xfs_log_force_umount( |
@@ -3680,6 +3726,16 @@ xfs_log_force_umount( | |||
3680 | return 1; | 3726 | return 1; |
3681 | } | 3727 | } |
3682 | retval = 0; | 3728 | retval = 0; |
3729 | |||
3730 | /* | ||
3731 | * Flush the in memory commit item list before marking the log as | ||
3732 | * being shut down. We need to do it in this order to ensure all the | ||
3733 | * completed transactions are flushed to disk with the xfs_log_force() | ||
3734 | * call below. | ||
3735 | */ | ||
3736 | if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) | ||
3737 | xlog_cil_push(log, 1); | ||
3738 | |||
3683 | /* | 3739 | /* |
3684 | * We must hold both the GRANT lock and the LOG lock, | 3740 | * We must hold both the GRANT lock and the LOG lock, |
3685 | * before we mark the filesystem SHUTDOWN and wake | 3741 | * before we mark the filesystem SHUTDOWN and wake |
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 229d1f36ba9a..04c78e642cc8 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h | |||
@@ -19,7 +19,6 @@ | |||
19 | #define __XFS_LOG_H__ | 19 | #define __XFS_LOG_H__ |
20 | 20 | ||
21 | /* get lsn fields */ | 21 | /* get lsn fields */ |
22 | |||
23 | #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) | 22 | #define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) |
24 | #define BLOCK_LSN(lsn) ((uint)(lsn)) | 23 | #define BLOCK_LSN(lsn) ((uint)(lsn)) |
25 | 24 | ||
@@ -114,6 +113,9 @@ struct xfs_log_vec { | |||
114 | struct xfs_log_vec *lv_next; /* next lv in build list */ | 113 | struct xfs_log_vec *lv_next; /* next lv in build list */ |
115 | int lv_niovecs; /* number of iovecs in lv */ | 114 | int lv_niovecs; /* number of iovecs in lv */ |
116 | struct xfs_log_iovec *lv_iovecp; /* iovec array */ | 115 | struct xfs_log_iovec *lv_iovecp; /* iovec array */ |
116 | struct xfs_log_item *lv_item; /* owner */ | ||
117 | char *lv_buf; /* formatted buffer */ | ||
118 | int lv_buf_len; /* size of formatted buffer */ | ||
117 | }; | 119 | }; |
118 | 120 | ||
119 | /* | 121 | /* |
@@ -134,6 +136,7 @@ struct xlog_in_core; | |||
134 | struct xlog_ticket; | 136 | struct xlog_ticket; |
135 | struct xfs_log_item; | 137 | struct xfs_log_item; |
136 | struct xfs_item_ops; | 138 | struct xfs_item_ops; |
139 | struct xfs_trans; | ||
137 | 140 | ||
138 | void xfs_log_item_init(struct xfs_mount *mp, | 141 | void xfs_log_item_init(struct xfs_mount *mp, |
139 | struct xfs_log_item *item, | 142 | struct xfs_log_item *item, |
@@ -187,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp); | |||
187 | 190 | ||
188 | void xlog_iodone(struct xfs_buf *); | 191 | void xlog_iodone(struct xfs_buf *); |
189 | 192 | ||
190 | struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); | 193 | struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); |
191 | void xfs_log_ticket_put(struct xlog_ticket *ticket); | 194 | void xfs_log_ticket_put(struct xlog_ticket *ticket); |
192 | 195 | ||
196 | xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); | ||
197 | |||
198 | int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, | ||
199 | struct xfs_log_vec *log_vector, | ||
200 | xfs_lsn_t *commit_lsn, int flags); | ||
201 | bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); | ||
202 | |||
193 | #endif | 203 | #endif |
194 | 204 | ||
195 | 205 | ||
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c new file mode 100644 index 000000000000..bb17cc044bf3 --- /dev/null +++ b/fs/xfs/xfs_log_cil.c | |||
@@ -0,0 +1,725 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it would be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
11 | * GNU General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public License | ||
14 | * along with this program; if not, write the Free Software Foundation, | ||
15 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
16 | */ | ||
17 | |||
18 | #include "xfs.h" | ||
19 | #include "xfs_fs.h" | ||
20 | #include "xfs_types.h" | ||
21 | #include "xfs_bit.h" | ||
22 | #include "xfs_log.h" | ||
23 | #include "xfs_inum.h" | ||
24 | #include "xfs_trans.h" | ||
25 | #include "xfs_trans_priv.h" | ||
26 | #include "xfs_log_priv.h" | ||
27 | #include "xfs_sb.h" | ||
28 | #include "xfs_ag.h" | ||
29 | #include "xfs_dir2.h" | ||
30 | #include "xfs_dmapi.h" | ||
31 | #include "xfs_mount.h" | ||
32 | #include "xfs_error.h" | ||
33 | #include "xfs_alloc.h" | ||
34 | |||
35 | /* | ||
36 | * Perform initial CIL structure initialisation. If the CIL is not | ||
37 | * enabled in this filesystem, ensure the log->l_cilp is null so | ||
38 | * we can check this conditional to determine if we are doing delayed | ||
39 | * logging or not. | ||
40 | */ | ||
41 | int | ||
42 | xlog_cil_init( | ||
43 | struct log *log) | ||
44 | { | ||
45 | struct xfs_cil *cil; | ||
46 | struct xfs_cil_ctx *ctx; | ||
47 | |||
48 | log->l_cilp = NULL; | ||
49 | if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG)) | ||
50 | return 0; | ||
51 | |||
52 | cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); | ||
53 | if (!cil) | ||
54 | return ENOMEM; | ||
55 | |||
56 | ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); | ||
57 | if (!ctx) { | ||
58 | kmem_free(cil); | ||
59 | return ENOMEM; | ||
60 | } | ||
61 | |||
62 | INIT_LIST_HEAD(&cil->xc_cil); | ||
63 | INIT_LIST_HEAD(&cil->xc_committing); | ||
64 | spin_lock_init(&cil->xc_cil_lock); | ||
65 | init_rwsem(&cil->xc_ctx_lock); | ||
66 | sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); | ||
67 | |||
68 | INIT_LIST_HEAD(&ctx->committing); | ||
69 | INIT_LIST_HEAD(&ctx->busy_extents); | ||
70 | ctx->sequence = 1; | ||
71 | ctx->cil = cil; | ||
72 | cil->xc_ctx = ctx; | ||
73 | |||
74 | cil->xc_log = log; | ||
75 | log->l_cilp = cil; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | void | ||
80 | xlog_cil_destroy( | ||
81 | struct log *log) | ||
82 | { | ||
83 | if (!log->l_cilp) | ||
84 | return; | ||
85 | |||
86 | if (log->l_cilp->xc_ctx) { | ||
87 | if (log->l_cilp->xc_ctx->ticket) | ||
88 | xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); | ||
89 | kmem_free(log->l_cilp->xc_ctx); | ||
90 | } | ||
91 | |||
92 | ASSERT(list_empty(&log->l_cilp->xc_cil)); | ||
93 | kmem_free(log->l_cilp); | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * Allocate a new ticket. Failing to get a new ticket makes it really hard to | ||
98 | * recover, so we don't allow failure here. Also, we allocate in a context that | ||
99 | * we don't want to be issuing transactions from, so we need to tell the | ||
100 | * allocation code this as well. | ||
101 | * | ||
102 | * We don't reserve any space for the ticket - we are going to steal whatever | ||
103 | * space we require from transactions as they commit. To ensure we reserve all | ||
104 | * the space required, we need to set the current reservation of the ticket to | ||
105 | * zero so that we know to steal the initial transaction overhead from the | ||
106 | * first transaction commit. | ||
107 | */ | ||
108 | static struct xlog_ticket * | ||
109 | xlog_cil_ticket_alloc( | ||
110 | struct log *log) | ||
111 | { | ||
112 | struct xlog_ticket *tic; | ||
113 | |||
114 | tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, | ||
115 | KM_SLEEP|KM_NOFS); | ||
116 | tic->t_trans_type = XFS_TRANS_CHECKPOINT; | ||
117 | |||
118 | /* | ||
119 | * set the current reservation to zero so we know to steal the basic | ||
120 | * transaction overhead reservation from the first transaction commit. | ||
121 | */ | ||
122 | tic->t_curr_res = 0; | ||
123 | return tic; | ||
124 | } | ||
125 | |||
126 | /* | ||
127 | * After the first stage of log recovery is done, we know where the head and | ||
128 | * tail of the log are. We need this log initialisation done before we can | ||
129 | * initialise the first CIL checkpoint context. | ||
130 | * | ||
131 | * Here we allocate a log ticket to track space usage during a CIL push. This | ||
132 | * ticket is passed to xlog_write() directly so that we don't slowly leak log | ||
133 | * space by failing to account for space used by log headers and additional | ||
134 | * region headers for split regions. | ||
135 | */ | ||
136 | void | ||
137 | xlog_cil_init_post_recovery( | ||
138 | struct log *log) | ||
139 | { | ||
140 | if (!log->l_cilp) | ||
141 | return; | ||
142 | |||
143 | log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); | ||
144 | log->l_cilp->xc_ctx->sequence = 1; | ||
145 | log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, | ||
146 | log->l_curr_block); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Insert the log item into the CIL and calculate the difference in space | ||
151 | * consumed by the item. Add the space to the checkpoint ticket and calculate | ||
152 | * if the change requires additional log metadata. If it does, take that space | ||
153 | * as well. Remove the amount of space we added to the checkpoint ticket from | ||
154 | * the current transaction ticket so that the accounting works out correctly. | ||
155 | * | ||
156 | * If this is the first time the item is being placed into the CIL in this | ||
157 | * context, pin it so it can't be written to disk until the CIL is flushed to | ||
158 | * the iclog and the iclog written to disk. | ||
159 | */ | ||
160 | static void | ||
161 | xlog_cil_insert( | ||
162 | struct log *log, | ||
163 | struct xlog_ticket *ticket, | ||
164 | struct xfs_log_item *item, | ||
165 | struct xfs_log_vec *lv) | ||
166 | { | ||
167 | struct xfs_cil *cil = log->l_cilp; | ||
168 | struct xfs_log_vec *old = lv->lv_item->li_lv; | ||
169 | struct xfs_cil_ctx *ctx = cil->xc_ctx; | ||
170 | int len; | ||
171 | int diff_iovecs; | ||
172 | int iclog_space; | ||
173 | |||
174 | if (old) { | ||
175 | /* existing lv on log item, space used is a delta */ | ||
176 | ASSERT(!list_empty(&item->li_cil)); | ||
177 | ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs); | ||
178 | |||
179 | len = lv->lv_buf_len - old->lv_buf_len; | ||
180 | diff_iovecs = lv->lv_niovecs - old->lv_niovecs; | ||
181 | kmem_free(old->lv_buf); | ||
182 | kmem_free(old); | ||
183 | } else { | ||
184 | /* new lv, must pin the log item */ | ||
185 | ASSERT(!lv->lv_item->li_lv); | ||
186 | ASSERT(list_empty(&item->li_cil)); | ||
187 | |||
188 | len = lv->lv_buf_len; | ||
189 | diff_iovecs = lv->lv_niovecs; | ||
190 | IOP_PIN(lv->lv_item); | ||
191 | |||
192 | } | ||
193 | len += diff_iovecs * sizeof(xlog_op_header_t); | ||
194 | |||
195 | /* attach new log vector to log item */ | ||
196 | lv->lv_item->li_lv = lv; | ||
197 | |||
198 | spin_lock(&cil->xc_cil_lock); | ||
199 | list_move_tail(&item->li_cil, &cil->xc_cil); | ||
200 | ctx->nvecs += diff_iovecs; | ||
201 | |||
202 | /* | ||
203 | * If this is the first time the item is being committed to the CIL, | ||
204 | * store the sequence number on the log item so we can tell | ||
205 | * in future commits whether this is the first checkpoint the item is | ||
206 | * being committed into. | ||
207 | */ | ||
208 | if (!item->li_seq) | ||
209 | item->li_seq = ctx->sequence; | ||
210 | |||
211 | /* | ||
212 | * Now transfer enough transaction reservation to the context ticket | ||
213 | * for the checkpoint. The context ticket is special - the unit | ||
214 | * reservation has to grow as well as the current reservation as we | ||
215 | * steal from tickets so we can correctly determine the space used | ||
216 | * during the transaction commit. | ||
217 | */ | ||
218 | if (ctx->ticket->t_curr_res == 0) { | ||
219 | /* first commit in checkpoint, steal the header reservation */ | ||
220 | ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len); | ||
221 | ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; | ||
222 | ticket->t_curr_res -= ctx->ticket->t_unit_res; | ||
223 | } | ||
224 | |||
225 | /* do we need space for more log record headers? */ | ||
226 | iclog_space = log->l_iclog_size - log->l_iclog_hsize; | ||
227 | if (len > 0 && (ctx->space_used / iclog_space != | ||
228 | (ctx->space_used + len) / iclog_space)) { | ||
229 | int hdrs; | ||
230 | |||
231 | hdrs = (len + iclog_space - 1) / iclog_space; | ||
232 | /* need to take into account split region headers, too */ | ||
233 | hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); | ||
234 | ctx->ticket->t_unit_res += hdrs; | ||
235 | ctx->ticket->t_curr_res += hdrs; | ||
236 | ticket->t_curr_res -= hdrs; | ||
237 | ASSERT(ticket->t_curr_res >= len); | ||
238 | } | ||
239 | ticket->t_curr_res -= len; | ||
240 | ctx->space_used += len; | ||
241 | |||
242 | spin_unlock(&cil->xc_cil_lock); | ||
243 | } | ||
244 | |||
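
The header stealing arithmetic in xlog_cil_insert() is compact enough to model on its own. The sketch below uses illustrative sizes (32KB iclogs, 512-byte record headers, a 12-byte op header); the kernel derives these from l_iclog_size, l_iclog_hsize and sizeof(struct xlog_op_header).

#include <stdio.h>

/*
 * Extra reservation charged when inserting 'len' more bytes pushes the
 * checkpoint across one or more iclog boundaries: one log record header
 * plus one split-region op header per boundary crossed.
 */
static int cil_extra_hdr_res(int space_used, int len,
			     int iclog_size, int iclog_hsize, int ophdr)
{
	int iclog_space = iclog_size - iclog_hsize;
	int hdrs;

	if (len <= 0 ||
	    space_used / iclog_space == (space_used + len) / iclog_space)
		return 0;	/* still fits in the current iclog */

	hdrs = (len + iclog_space - 1) / iclog_space;
	return hdrs * (iclog_hsize + ophdr);
}

int main(void)
{
	/* 30KB already staged; an 8KB insert crosses one boundary */
	printf("%d bytes stolen for headers\n",
	       cil_extra_hdr_res(30 * 1024, 8 * 1024, 32 * 1024, 512, 12));
	return 0;
}
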
245 | /* | ||
246 | * Format log items into flat buffers | ||
247 | * | ||
248 | * For delayed logging, we need to hold a formatted buffer containing all the | ||
249 | * changes on the log item. This enables us to relog the item in memory and | ||
250 | * write it out asynchronously without needing to relock the object that was | ||
251 | * modified at the time it gets written into the iclog. | ||
252 | * | ||
253 | * This function builds a vector for the changes in each log item in the | ||
254 | * transaction. It then works out the length of the buffer needed for each log | ||
255 | * item, allocates them and formats the vector for the item into the buffer. | ||
256 | * The buffer is then attached to the log item, which is then inserted into the | ||
257 | * Committed Item List for tracking until the next checkpoint is written out. | ||
258 | * | ||
259 | * We don't set up region headers during this process; we simply copy the | ||
260 | * regions into the flat buffer. We can do this because we still have to do a | ||
261 | * formatting step to write the regions into the iclog buffer. Writing the | ||
262 | * ophdrs during the iclog write means that we can support splitting large | ||
263 | * regions across iclog boundaries without needing a change in the format of the | ||
264 | * item/region encapsulation. | ||
265 | * | ||
266 | * Hence what we need to do now is rewrite the vector array to point | ||
267 | * to the copied region inside the buffer we just allocated. This allows us to | ||
268 | * format the regions into the iclog as though they are being formatted | ||
269 | * directly out of the objects themselves. | ||
270 | */ | ||
271 | static void | ||
272 | xlog_cil_format_items( | ||
273 | struct log *log, | ||
274 | struct xfs_log_vec *log_vector, | ||
275 | struct xlog_ticket *ticket, | ||
276 | xfs_lsn_t *start_lsn) | ||
277 | { | ||
278 | struct xfs_log_vec *lv; | ||
279 | |||
280 | if (start_lsn) | ||
281 | *start_lsn = log->l_cilp->xc_ctx->sequence; | ||
282 | |||
283 | ASSERT(log_vector); | ||
284 | for (lv = log_vector; lv; lv = lv->lv_next) { | ||
285 | void *ptr; | ||
286 | int index; | ||
287 | int len = 0; | ||
288 | |||
289 | /* build the vector array and calculate its length */ | ||
290 | IOP_FORMAT(lv->lv_item, lv->lv_iovecp); | ||
291 | for (index = 0; index < lv->lv_niovecs; index++) | ||
292 | len += lv->lv_iovecp[index].i_len; | ||
293 | |||
294 | lv->lv_buf_len = len; | ||
295 | lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS); | ||
296 | ptr = lv->lv_buf; | ||
297 | |||
298 | for (index = 0; index < lv->lv_niovecs; index++) { | ||
299 | struct xfs_log_iovec *vec = &lv->lv_iovecp[index]; | ||
300 | |||
301 | memcpy(ptr, vec->i_addr, vec->i_len); | ||
302 | vec->i_addr = ptr; | ||
303 | ptr += vec->i_len; | ||
304 | } | ||
305 | ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); | ||
306 | |||
307 | xlog_cil_insert(log, ticket, lv->lv_item, lv); | ||
308 | } | ||
309 | } | ||
310 | |||
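
The loop above is the core delayed logging trick: copy every region into one contiguous snapshot buffer, then repoint each iovec at its copy, so later formatting into the iclog reads from the snapshot rather than the live, unlocked object. The same pattern in a standalone sketch, using a generic iovec type rather than the kernel structures:

#include <stdlib.h>
#include <string.h>

struct iov { void *addr; int len; };

/*
 * Flatten 'n' scattered regions into one buffer and repoint the iovecs
 * at the copies, mirroring what xlog_cil_format_items() does per item.
 */
static char *flatten(struct iov *v, int n)
{
	int i, len = 0;
	char *buf, *p;

	for (i = 0; i < n; i++)
		len += v[i].len;

	buf = malloc(len);
	if (!buf)
		return NULL;

	for (p = buf, i = 0; i < n; i++) {
		memcpy(p, v[i].addr, v[i].len);
		v[i].addr = p;		/* iovec now points into the copy */
		p += v[i].len;
	}
	return buf;
}

int main(void)
{
	char a[] = "head", b[] = "tail";
	struct iov v[2] = { { a, 4 }, { b, 4 } };
	char *snap = flatten(v, 2);

	/* v[0].addr and v[1].addr now reference snap, not a/b */
	free(snap);
	return 0;
}
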
311 | static void | ||
312 | xlog_cil_free_logvec( | ||
313 | struct xfs_log_vec *log_vector) | ||
314 | { | ||
315 | struct xfs_log_vec *lv; | ||
316 | |||
317 | for (lv = log_vector; lv; ) { | ||
318 | struct xfs_log_vec *next = lv->lv_next; | ||
319 | kmem_free(lv->lv_buf); | ||
320 | kmem_free(lv); | ||
321 | lv = next; | ||
322 | } | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Commit a transaction with the given vector to the Committed Item List. | ||
327 | * | ||
328 | * To do this, we need to format the item, pin it in memory if required and | ||
329 | * account for the space used by the transaction. Once we have done that we | ||
330 | * need to release the unused reservation for the transaction, attach the | ||
331 | * transaction to the checkpoint context so we carry the busy extents through | ||
332 | * to checkpoint completion, and then unlock all the items in the transaction. | ||
333 | * | ||
334 | * For more specific information about the order of operations in | ||
335 | * xfs_log_commit_cil() please refer to the comments in | ||
336 | * xfs_trans_commit_iclog(). | ||
337 | * | ||
338 | * Called with the context lock already held in read mode to lock out | ||
339 | * background commit, returns without it held once background commits are | ||
340 | * allowed again. | ||
341 | */ | ||
342 | int | ||
343 | xfs_log_commit_cil( | ||
344 | struct xfs_mount *mp, | ||
345 | struct xfs_trans *tp, | ||
346 | struct xfs_log_vec *log_vector, | ||
347 | xfs_lsn_t *commit_lsn, | ||
348 | int flags) | ||
349 | { | ||
350 | struct log *log = mp->m_log; | ||
351 | int log_flags = 0; | ||
352 | int push = 0; | ||
353 | |||
354 | if (flags & XFS_TRANS_RELEASE_LOG_RES) | ||
355 | log_flags = XFS_LOG_REL_PERM_RESERV; | ||
356 | |||
357 | if (XLOG_FORCED_SHUTDOWN(log)) { | ||
358 | xlog_cil_free_logvec(log_vector); | ||
359 | return XFS_ERROR(EIO); | ||
360 | } | ||
361 | |||
362 | /* lock out background commit */ | ||
363 | down_read(&log->l_cilp->xc_ctx_lock); | ||
364 | xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); | ||
365 | |||
366 | /* check we didn't blow the reservation */ | ||
367 | if (tp->t_ticket->t_curr_res < 0) | ||
368 | xlog_print_tic_res(log->l_mp, tp->t_ticket); | ||
369 | |||
370 | /* attach the transaction to the CIL if it has any busy extents */ | ||
371 | if (!list_empty(&tp->t_busy)) { | ||
372 | spin_lock(&log->l_cilp->xc_cil_lock); | ||
373 | list_splice_init(&tp->t_busy, | ||
374 | &log->l_cilp->xc_ctx->busy_extents); | ||
375 | spin_unlock(&log->l_cilp->xc_cil_lock); | ||
376 | } | ||
377 | |||
378 | tp->t_commit_lsn = *commit_lsn; | ||
379 | xfs_log_done(mp, tp->t_ticket, NULL, log_flags); | ||
380 | xfs_trans_unreserve_and_mod_sb(tp); | ||
381 | |||
382 | /* check for background commit before unlock */ | ||
383 | if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) | ||
384 | push = 1; | ||
385 | up_read(&log->l_cilp->xc_ctx_lock); | ||
386 | |||
387 | /* | ||
388 | * We need to push the CIL every so often so we don't cache more than we | ||
389 | * can fit in the log. The limit really is that a checkpoint can't be | ||
390 | * more than half the log (the current checkpoint is not allowed to | ||
391 | * overwrite the previous checkpoint), but commit latency and memory | ||
392 | * usage limit this to a smaller size in most cases. | ||
393 | */ | ||
394 | if (push) | ||
395 | xlog_cil_push(log, 0); | ||
396 | return 0; | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * Mark all items committed and clear busy extents. We free the log vector | ||
401 | * chains in a separate pass so that we unpin the log items as quickly as | ||
402 | * possible. | ||
403 | */ | ||
404 | static void | ||
405 | xlog_cil_committed( | ||
406 | void *args, | ||
407 | int abort) | ||
408 | { | ||
409 | struct xfs_cil_ctx *ctx = args; | ||
410 | struct xfs_log_vec *lv; | ||
411 | int abortflag = abort ? XFS_LI_ABORTED : 0; | ||
412 | struct xfs_busy_extent *busyp, *n; | ||
413 | |||
414 | /* unpin all the log items */ | ||
415 | for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { | ||
416 | xfs_trans_item_committed(lv->lv_item, ctx->start_lsn, | ||
417 | abortflag); | ||
418 | } | ||
419 | |||
420 | list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) | ||
421 | xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); | ||
422 | |||
423 | spin_lock(&ctx->cil->xc_cil_lock); | ||
424 | list_del(&ctx->committing); | ||
425 | spin_unlock(&ctx->cil->xc_cil_lock); | ||
426 | |||
427 | xlog_cil_free_logvec(ctx->lv_chain); | ||
428 | kmem_free(ctx); | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * Push the Committed Item List to the log. If the push_now flag is not set, | ||
433 | * then it is a background flush and so we can choose to ignore it. | ||
434 | */ | ||
435 | int | ||
436 | xlog_cil_push( | ||
437 | struct log *log, | ||
438 | int push_now) | ||
439 | { | ||
440 | struct xfs_cil *cil = log->l_cilp; | ||
441 | struct xfs_log_vec *lv; | ||
442 | struct xfs_cil_ctx *ctx; | ||
443 | struct xfs_cil_ctx *new_ctx; | ||
444 | struct xlog_in_core *commit_iclog; | ||
445 | struct xlog_ticket *tic; | ||
446 | int num_lv; | ||
447 | int num_iovecs; | ||
448 | int len; | ||
449 | int error = 0; | ||
450 | struct xfs_trans_header thdr; | ||
451 | struct xfs_log_iovec lhdr; | ||
452 | struct xfs_log_vec lvhdr = { NULL }; | ||
453 | xfs_lsn_t commit_lsn; | ||
454 | |||
455 | if (!cil) | ||
456 | return 0; | ||
457 | |||
458 | new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); | ||
459 | new_ctx->ticket = xlog_cil_ticket_alloc(log); | ||
460 | |||
461 | /* lock out transaction commit, but don't block on background push */ | ||
462 | if (!down_write_trylock(&cil->xc_ctx_lock)) { | ||
463 | if (!push_now) | ||
464 | goto out_free_ticket; | ||
465 | down_write(&cil->xc_ctx_lock); | ||
466 | } | ||
467 | ctx = cil->xc_ctx; | ||
468 | |||
469 | /* check if we've anything to push */ | ||
470 | if (list_empty(&cil->xc_cil)) | ||
471 | goto out_skip; | ||
472 | |||
473 | /* check for spurious background flush */ | ||
474 | if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) | ||
475 | goto out_skip; | ||
476 | |||
477 | /* | ||
478 | * pull all the log vectors off the items in the CIL, and | ||
479 | * remove the items from the CIL. We don't need the CIL lock | ||
480 | * here because it's only needed on the transaction commit | ||
481 | * side which is currently locked out by the flush lock. | ||
482 | */ | ||
483 | lv = NULL; | ||
484 | num_lv = 0; | ||
485 | num_iovecs = 0; | ||
486 | len = 0; | ||
487 | while (!list_empty(&cil->xc_cil)) { | ||
488 | struct xfs_log_item *item; | ||
489 | int i; | ||
490 | |||
491 | item = list_first_entry(&cil->xc_cil, | ||
492 | struct xfs_log_item, li_cil); | ||
493 | list_del_init(&item->li_cil); | ||
494 | if (!ctx->lv_chain) | ||
495 | ctx->lv_chain = item->li_lv; | ||
496 | else | ||
497 | lv->lv_next = item->li_lv; | ||
498 | lv = item->li_lv; | ||
499 | item->li_lv = NULL; | ||
500 | |||
501 | num_lv++; | ||
502 | num_iovecs += lv->lv_niovecs; | ||
503 | for (i = 0; i < lv->lv_niovecs; i++) | ||
504 | len += lv->lv_iovecp[i].i_len; | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * initialise the new context and attach it to the CIL. Then attach | ||
509 | * the current context to the CIL committing list so it can be found | ||
510 | * during log forces to extract the commit lsn of the sequence that | ||
511 | * needs to be forced. | ||
512 | */ | ||
513 | INIT_LIST_HEAD(&new_ctx->committing); | ||
514 | INIT_LIST_HEAD(&new_ctx->busy_extents); | ||
515 | new_ctx->sequence = ctx->sequence + 1; | ||
516 | new_ctx->cil = cil; | ||
517 | cil->xc_ctx = new_ctx; | ||
518 | |||
519 | /* | ||
520 | * The switch is now done, so we can drop the context lock and move out | ||
521 | * of a shared context. We can't just go straight to the commit record, | ||
522 | * though - we need to synchronise with previous and future commits so | ||
523 | * that the commit records are correctly ordered in the log to ensure | ||
524 | * that we process items during log IO completion in the correct order. | ||
525 | * | ||
526 | * For example, if we get an EFI in one checkpoint and the EFD in the | ||
527 | * next (e.g. due to log forces), we do not want the checkpoint with | ||
528 | * the EFD to be committed before the checkpoint with the EFI. Hence | ||
529 | * we must strictly order the commit records of the checkpoints so | ||
530 | * that: a) the checkpoint callbacks are attached to the iclogs in the | ||
531 | * correct order; and b) the checkpoints are replayed in correct order | ||
532 | * in log recovery. | ||
533 | * | ||
534 | * Hence we need to add this context to the committing context list so | ||
535 | * that higher sequences will wait for us to write out a commit record | ||
536 | * before they do. | ||
537 | */ | ||
538 | spin_lock(&cil->xc_cil_lock); | ||
539 | list_add(&ctx->committing, &cil->xc_committing); | ||
540 | spin_unlock(&cil->xc_cil_lock); | ||
541 | up_write(&cil->xc_ctx_lock); | ||
542 | |||
543 | /* | ||
544 | * Build a checkpoint transaction header and write it to the log to | ||
545 | * begin the transaction. We need to account for the space used by the | ||
546 | * transaction header here as it is not accounted for in xlog_write(). | ||
547 | * | ||
548 | * The LSN we need to pass to the log items on transaction commit is | ||
549 | * the LSN reported by the first log vector write. If we use the commit | ||
550 | * record lsn then we can move the tail beyond the grant write head. | ||
551 | */ | ||
552 | tic = ctx->ticket; | ||
553 | thdr.th_magic = XFS_TRANS_HEADER_MAGIC; | ||
554 | thdr.th_type = XFS_TRANS_CHECKPOINT; | ||
555 | thdr.th_tid = tic->t_tid; | ||
556 | thdr.th_num_items = num_iovecs; | ||
557 | lhdr.i_addr = (xfs_caddr_t)&thdr; | ||
558 | lhdr.i_len = sizeof(xfs_trans_header_t); | ||
559 | lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; | ||
560 | tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); | ||
561 | |||
562 | lvhdr.lv_niovecs = 1; | ||
563 | lvhdr.lv_iovecp = &lhdr; | ||
564 | lvhdr.lv_next = ctx->lv_chain; | ||
565 | |||
566 | error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); | ||
567 | if (error) | ||
568 | goto out_abort; | ||
569 | |||
570 | /* | ||
571 | * now that we've written the checkpoint into the log, strictly | ||
572 | * order the commit records so replay will get them in the right order. | ||
573 | */ | ||
574 | restart: | ||
575 | spin_lock(&cil->xc_cil_lock); | ||
576 | list_for_each_entry(new_ctx, &cil->xc_committing, committing) { | ||
577 | /* | ||
578 | * Higher sequences will wait for this one so skip them. | ||
579 | * Don't wait for our own sequence, either. | ||
580 | */ | ||
581 | if (new_ctx->sequence >= ctx->sequence) | ||
582 | continue; | ||
583 | if (!new_ctx->commit_lsn) { | ||
584 | /* | ||
585 | * It is still being pushed! Wait for the push to | ||
586 | * complete, then start again from the beginning. | ||
587 | */ | ||
588 | sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); | ||
589 | goto restart; | ||
590 | } | ||
591 | } | ||
592 | spin_unlock(&cil->xc_cil_lock); | ||
593 | |||
594 | commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); | ||
595 | if (error || commit_lsn == -1) | ||
596 | goto out_abort; | ||
597 | |||
598 | /* attach all the transactions w/ busy extents to iclog */ | ||
599 | ctx->log_cb.cb_func = xlog_cil_committed; | ||
600 | ctx->log_cb.cb_arg = ctx; | ||
601 | error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); | ||
602 | if (error) | ||
603 | goto out_abort; | ||
604 | |||
605 | /* | ||
606 | * now the checkpoint commit is complete and we've attached the | ||
607 | * callbacks to the iclog we can assign the commit LSN to the context | ||
608 | * and wake up anyone who is waiting for the commit to complete. | ||
609 | */ | ||
610 | spin_lock(&cil->xc_cil_lock); | ||
611 | ctx->commit_lsn = commit_lsn; | ||
612 | sv_broadcast(&cil->xc_commit_wait); | ||
613 | spin_unlock(&cil->xc_cil_lock); | ||
614 | |||
615 | /* release the hounds! */ | ||
616 | return xfs_log_release_iclog(log->l_mp, commit_iclog); | ||
617 | |||
618 | out_skip: | ||
619 | up_write(&cil->xc_ctx_lock); | ||
620 | out_free_ticket: | ||
621 | xfs_log_ticket_put(new_ctx->ticket); | ||
622 | kmem_free(new_ctx); | ||
623 | return 0; | ||
624 | |||
625 | out_abort: | ||
626 | xlog_cil_committed(ctx, XFS_LI_ABORTED); | ||
627 | return XFS_ERROR(EIO); | ||
628 | } | ||
629 | |||
630 | /* | ||
631 | * Conditionally push the CIL based on the sequence passed in. | ||
632 | * | ||
633 | * We only need to push if we haven't already pushed the sequence | ||
634 | * number given. Hence the only time we will trigger a push here is | ||
635 | * if the push sequence is the same as the current context. | ||
636 | * | ||
637 | * We return the current commit lsn to allow the callers to determine if an | ||
638 | * iclog flush is necessary following this call. | ||
639 | * | ||
640 | * XXX: Initially, just push the CIL unconditionally and return whatever | ||
641 | * commit lsn is there. It'll be empty, so this is broken for now. | ||
642 | */ | ||
643 | xfs_lsn_t | ||
644 | xlog_cil_push_lsn( | ||
645 | struct log *log, | ||
646 | xfs_lsn_t push_seq) | ||
647 | { | ||
648 | struct xfs_cil *cil = log->l_cilp; | ||
649 | struct xfs_cil_ctx *ctx; | ||
650 | xfs_lsn_t commit_lsn = NULLCOMMITLSN; | ||
651 | |||
652 | restart: | ||
653 | down_write(&cil->xc_ctx_lock); | ||
654 | ASSERT(push_seq <= cil->xc_ctx->sequence); | ||
655 | |||
656 | /* check to see if we need to force out the current context */ | ||
657 | if (push_seq == cil->xc_ctx->sequence) { | ||
658 | up_write(&cil->xc_ctx_lock); | ||
659 | xlog_cil_push(log, 1); | ||
660 | goto restart; | ||
661 | } | ||
662 | |||
663 | /* | ||
664 | * See if we can find a previous sequence still committing. | ||
665 | * We can drop the flush lock as soon as we have the cil lock | ||
666 | * because we are now only comparing contexts protected by | ||
667 | * the cil lock. | ||
668 | * | ||
669 | * We need to wait for all previous sequence commits to complete | ||
670 | * before allowing the force of push_seq to go ahead. Hence block | ||
671 | * on commits for those as well. | ||
672 | */ | ||
673 | spin_lock(&cil->xc_cil_lock); | ||
674 | up_write(&cil->xc_ctx_lock); | ||
675 | list_for_each_entry(ctx, &cil->xc_committing, committing) { | ||
676 | if (ctx->sequence > push_seq) | ||
677 | continue; | ||
678 | if (!ctx->commit_lsn) { | ||
679 | /* | ||
680 | * It is still being pushed! Wait for the push to | ||
681 | * complete, then start again from the beginning. | ||
682 | */ | ||
683 | sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); | ||
684 | goto restart; | ||
685 | } | ||
686 | if (ctx->sequence != push_seq) | ||
687 | continue; | ||
688 | /* found it! */ | ||
689 | commit_lsn = ctx->commit_lsn; | ||
690 | } | ||
691 | spin_unlock(&cil->xc_cil_lock); | ||
692 | return commit_lsn; | ||
693 | } | ||
694 | |||
695 | /* | ||
696 | * Check if the current log item was first committed in this sequence. | ||
698 | * We can't rely on just the log item being in the CIL; we have to check | ||
698 | * the recorded commit sequence number. | ||
699 | * | ||
700 | * Note: for this to be used in a non-racy manner, it has to be called with | ||
701 | * CIL flushing locked out. As a result, it should only be used during the | ||
702 | * transaction commit process when deciding what to format into the item. | ||
703 | */ | ||
704 | bool | ||
705 | xfs_log_item_in_current_chkpt( | ||
706 | struct xfs_log_item *lip) | ||
707 | { | ||
708 | struct xfs_cil_ctx *ctx; | ||
709 | |||
710 | if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG)) | ||
711 | return false; | ||
712 | if (list_empty(&lip->li_cil)) | ||
713 | return false; | ||
714 | |||
715 | ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; | ||
716 | |||
717 | /* | ||
718 | * li_seq is written on the first commit of a log item to record the | ||
719 | * first checkpoint it is written to. Hence if it is different to the | ||
720 | * current sequence, we're in a new checkpoint. | ||
721 | */ | ||
722 | if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) | ||
723 | return false; | ||
724 | return true; | ||
725 | } | ||
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 9cf695154451..8c072618965c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h | |||
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i) | |||
152 | #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ | 152 | #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ |
153 | #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being | 153 | #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being |
154 | shutdown */ | 154 | shutdown */ |
155 | typedef __uint32_t xlog_tid_t; | ||
156 | |||
157 | 155 | ||
158 | #ifdef __KERNEL__ | 156 | #ifdef __KERNEL__ |
159 | /* | 157 | /* |
@@ -379,6 +377,99 @@ typedef struct xlog_in_core { | |||
379 | } xlog_in_core_t; | 377 | } xlog_in_core_t; |
380 | 378 | ||
381 | /* | 379 | /* |
380 | * The CIL context is used to aggregate per-transaction details as well as be | ||
381 | * passed to the iclog for checkpoint post-commit processing. After being | ||
382 | * passed to the iclog, another context needs to be allocated for tracking the | ||
383 | * next set of transactions to be aggregated into a checkpoint. | ||
384 | */ | ||
385 | struct xfs_cil; | ||
386 | |||
387 | struct xfs_cil_ctx { | ||
388 | struct xfs_cil *cil; | ||
389 | xfs_lsn_t sequence; /* chkpt sequence # */ | ||
390 | xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ | ||
391 | xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ | ||
392 | struct xlog_ticket *ticket; /* chkpt ticket */ | ||
393 | int nvecs; /* number of regions */ | ||
394 | int space_used; /* aggregate size of regions */ | ||
395 | struct list_head busy_extents; /* busy extents in chkpt */ | ||
396 | struct xfs_log_vec *lv_chain; /* logvecs being pushed */ | ||
397 | xfs_log_callback_t log_cb; /* completion callback hook. */ | ||
398 | struct list_head committing; /* ctx committing list */ | ||
399 | }; | ||
400 | |||
401 | /* | ||
402 | * Committed Item List structure | ||
403 | * | ||
404 | * This structure is used to track log items that have been committed but not | ||
405 | * yet written into the log. It is used only when the delayed logging mount | ||
406 | * option is enabled. | ||
407 | * | ||
408 | * This structure tracks the list of committing checkpoint contexts so | ||
409 | * we can avoid the problem of having to hold out new transactions during a | ||
410 | * flush until we have the commit record LSN of the checkpoint. We can | ||
411 | * traverse the list of committing contexts in xlog_cil_push_lsn() to find a | ||
412 | * sequence match and extract the commit LSN directly from there. If the | ||
413 | * checkpoint is still in the process of committing, we can block waiting for | ||
414 | * the commit LSN to be determined as well. This should make synchronous | ||
415 | * operations almost as efficient as the old logging methods. | ||
416 | */ | ||
417 | struct xfs_cil { | ||
418 | struct log *xc_log; | ||
419 | struct list_head xc_cil; | ||
420 | spinlock_t xc_cil_lock; | ||
421 | struct xfs_cil_ctx *xc_ctx; | ||
422 | struct rw_semaphore xc_ctx_lock; | ||
423 | struct list_head xc_committing; | ||
424 | sv_t xc_commit_wait; | ||
425 | }; | ||
426 | |||
427 | /* | ||
428 | * The amount of log space we allow the CIL to aggregate is difficult to size. | ||
429 | * Whatever we choose, we have to make sure we can get a reservation for the log | ||
430 | * space effectively, that it is large enough to capture sufficient relogging to | ||
431 | * reduce log buffer IO significantly, but not so large that it overruns the log | ||
432 | * or induces too much latency when writing out through the iclogs. We track both | ||
433 | * space consumed and the number of vectors in the checkpoint context, so we | ||
434 | * need to decide which to use for limiting. | ||
435 | * | ||
436 | * Every log buffer we write out during a push needs a header reserved, which | ||
437 | * is at least one sector and more for v2 logs. Hence we need a reservation of | ||
438 | * at least 512 bytes per 32k of log space just for the LR headers. That means | ||
439 | * 16KB of reservation per megabyte of delayed logging space we will consume, | ||
440 | * plus various headers. The number of headers will vary based on the number | ||
441 | * of iovecs, so limiting on a specific number of vectors is going to result | ||
442 | * in transactions of varying size. IOWs, it is more consistent to track and | ||
443 | * limit space consumed in the log rather than by the number of objects being | ||
444 | * logged in order to prevent checkpoint ticket overruns. | ||
445 | * | ||
446 | * Further, use of static reservations through the log grant mechanism is | ||
447 | * problematic. It introduces a lot of complexity (e.g. reserve grant vs write | ||
448 | * grant) and a significant deadlock potential because regranting write space | ||
449 | * can block on log pushes. Hence if we have to regrant log space during a log | ||
450 | * push, we can deadlock. | ||
451 | * | ||
452 | * However, we can avoid this by use of a dynamic "reservation stealing" | ||
453 | * technique during transaction commit whereby unused reservation space in the | ||
454 | * transaction ticket is transferred to the CIL ctx commit ticket to cover the | ||
455 | * space needed by the checkpoint transaction. This means that we never need to | ||
456 | * specifically reserve space for the CIL checkpoint transaction, nor do we | ||
457 | * need to regrant space once the checkpoint completes. This also means the | ||
458 | * checkpoint transaction ticket is specific to the checkpoint context, rather | ||
459 | * than the CIL itself. | ||
460 | * | ||
461 | * With dynamic reservations, we can basically make up arbitrary limits for the | ||
462 | * checkpoint size so long as they don't violate any other size rules. Hence | ||
463 | * the initial maximum size for the checkpoint transaction will be set to a | ||
464 | * quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit | ||
465 | * right now based on the latency of writing out a large amount of data through | ||
466 | * the circular iclog buffers. | ||
467 | */ | ||
468 | |||
469 | #define XLOG_CIL_SPACE_LIMIT(log) \ | ||
470 | (min((log->l_logsize >> 2), (8 * 1024 * 1024))) | ||
471 | |||
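
For concreteness, the limit evaluates to a quarter of the log with an 8MB ceiling. A quick sketch of the arithmetic, taking l_logsize as the log size in bytes:

#include <stdio.h>

static long cil_space_limit(long logsize)
{
	long quarter = logsize >> 2;

	return quarter < 8 * 1024 * 1024 ? quarter : 8 * 1024 * 1024;
}

int main(void)
{
	/* 16MB log: limit is 4MB; 128MB log: capped at 8MB */
	printf("%ld\n", cil_space_limit(16L * 1024 * 1024));
	printf("%ld\n", cil_space_limit(128L * 1024 * 1024));
	return 0;
}
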
472 | /* | ||
382 | * The reservation head lsn is not made up of a cycle number and block number. | 473 | * The reservation head lsn is not made up of a cycle number and block number. |
383 | * Instead, it uses a cycle number and byte number. Logs don't expect to | 474 | * Instead, it uses a cycle number and byte number. Logs don't expect to |
384 | * overflow 31 bits worth of byte offset, so using a byte number will mean | 475 | * overflow 31 bits worth of byte offset, so using a byte number will mean |
@@ -388,6 +479,7 @@ typedef struct log { | |||
388 | /* The following fields don't need locking */ | 479 | /* The following fields don't need locking */ |
389 | struct xfs_mount *l_mp; /* mount point */ | 480 | struct xfs_mount *l_mp; /* mount point */ |
390 | struct xfs_ail *l_ailp; /* AIL log is working with */ | 481 | struct xfs_ail *l_ailp; /* AIL log is working with */ |
482 | struct xfs_cil *l_cilp; /* CIL log is working with */ | ||
391 | struct xfs_buf *l_xbuf; /* extra buffer for log | 483 | struct xfs_buf *l_xbuf; /* extra buffer for log |
392 | * wrapping */ | 484 | * wrapping */ |
393 | struct xfs_buftarg *l_targ; /* buftarg of log */ | 485 | struct xfs_buftarg *l_targ; /* buftarg of log */ |
@@ -438,14 +530,17 @@ typedef struct log { | |||
438 | 530 | ||
439 | #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) | 531 | #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) |
440 | 532 | ||
441 | |||
442 | /* common routines */ | 533 | /* common routines */ |
443 | extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); | 534 | extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); |
444 | extern int xlog_recover(xlog_t *log); | 535 | extern int xlog_recover(xlog_t *log); |
445 | extern int xlog_recover_finish(xlog_t *log); | 536 | extern int xlog_recover_finish(xlog_t *log); |
446 | extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); | 537 | extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); |
447 | 538 | ||
448 | extern kmem_zone_t *xfs_log_ticket_zone; | 539 | extern kmem_zone_t *xfs_log_ticket_zone; |
540 | struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes, | ||
541 | int count, char client, uint xflags, | ||
542 | int alloc_flags); | ||
543 | |||
449 | 544 | ||
450 | static inline void | 545 | static inline void |
451 | xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) | 546 | xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) |
@@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) | |||
455 | *off += bytes; | 550 | *off += bytes; |
456 | } | 551 | } |
457 | 552 | ||
553 | void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); | ||
554 | int xlog_write(struct log *log, struct xfs_log_vec *log_vector, | ||
555 | struct xlog_ticket *tic, xfs_lsn_t *start_lsn, | ||
556 | xlog_in_core_t **commit_iclog, uint flags); | ||
557 | |||
558 | /* | ||
559 | * Committed Item List interfaces | ||
560 | */ | ||
561 | int xlog_cil_init(struct log *log); | ||
562 | void xlog_cil_init_post_recovery(struct log *log); | ||
563 | void xlog_cil_destroy(struct log *log); | ||
564 | |||
565 | int xlog_cil_push(struct log *log, int push_now); | ||
566 | xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); | ||
567 | |||
458 | /* | 568 | /* |
459 | * Unmount record type is used as a pseudo transaction type for the ticket. | 569 | * Unmount record type is used as a pseudo transaction type for the ticket. |
460 | * It's value must be outside the range of XFS_TRANS_* values. | 570 | * It's value must be outside the range of XFS_TRANS_* values. |
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 0de08e366315..14a69aec2c0b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c | |||
@@ -1576,7 +1576,7 @@ xlog_recover_reorder_trans( | |||
1576 | 1576 | ||
1577 | switch (ITEM_TYPE(item)) { | 1577 | switch (ITEM_TYPE(item)) { |
1578 | case XFS_LI_BUF: | 1578 | case XFS_LI_BUF: |
1579 | if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { | 1579 | if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { |
1580 | trace_xfs_log_recover_item_reorder_head(log, | 1580 | trace_xfs_log_recover_item_reorder_head(log, |
1581 | trans, item, pass); | 1581 | trans, item, pass); |
1582 | list_move(&item->ri_list, &trans->r_itemq); | 1582 | list_move(&item->ri_list, &trans->r_itemq); |
@@ -1638,7 +1638,7 @@ xlog_recover_do_buffer_pass1( | |||
1638 | /* | 1638 | /* |
1639 | * If this isn't a cancel buffer item, then just return. | 1639 | * If this isn't a cancel buffer item, then just return. |
1640 | */ | 1640 | */ |
1641 | if (!(flags & XFS_BLI_CANCEL)) { | 1641 | if (!(flags & XFS_BLF_CANCEL)) { |
1642 | trace_xfs_log_recover_buf_not_cancel(log, buf_f); | 1642 | trace_xfs_log_recover_buf_not_cancel(log, buf_f); |
1643 | return; | 1643 | return; |
1644 | } | 1644 | } |
@@ -1696,7 +1696,7 @@ xlog_recover_do_buffer_pass1( | |||
1696 | * Check to see whether the buffer being recovered has a corresponding | 1696 | * Check to see whether the buffer being recovered has a corresponding |
1697 | * entry in the buffer cancel record table. If it does then return 1 | 1697 | * entry in the buffer cancel record table. If it does then return 1 |
1698 | * so that it will be cancelled, otherwise return 0. If the buffer is | 1698 | * so that it will be cancelled, otherwise return 0. If the buffer is |
1699 | * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement | 1699 | * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement |
1700 | * the refcount on the entry in the table and remove it from the table | 1700 | * the refcount on the entry in the table and remove it from the table |
1701 | * if this is the last reference. | 1701 | * if this is the last reference. |
1702 | * | 1702 | * |
@@ -1721,7 +1721,7 @@ xlog_check_buffer_cancelled( | |||
1721 | * There is nothing in the table built in pass one, | 1721 | * There is nothing in the table built in pass one, |
1722 | * so this buffer must not be cancelled. | 1722 | * so this buffer must not be cancelled. |
1723 | */ | 1723 | */ |
1724 | ASSERT(!(flags & XFS_BLI_CANCEL)); | 1724 | ASSERT(!(flags & XFS_BLF_CANCEL)); |
1725 | return 0; | 1725 | return 0; |
1726 | } | 1726 | } |
1727 | 1727 | ||
@@ -1733,7 +1733,7 @@ xlog_check_buffer_cancelled( | |||
1733 | * There is no corresponding entry in the table built | 1733 | * There is no corresponding entry in the table built |
1734 | * in pass one, so this buffer has not been cancelled. | 1734 | * in pass one, so this buffer has not been cancelled. |
1735 | */ | 1735 | */ |
1736 | ASSERT(!(flags & XFS_BLI_CANCEL)); | 1736 | ASSERT(!(flags & XFS_BLF_CANCEL)); |
1737 | return 0; | 1737 | return 0; |
1738 | } | 1738 | } |
1739 | 1739 | ||
@@ -1752,7 +1752,7 @@ xlog_check_buffer_cancelled( | |||
1752 | * one in the table and remove it if this is the | 1752 | * one in the table and remove it if this is the |
1753 | * last reference. | 1753 | * last reference. |
1754 | */ | 1754 | */ |
1755 | if (flags & XFS_BLI_CANCEL) { | 1755 | if (flags & XFS_BLF_CANCEL) { |
1756 | bcp->bc_refcount--; | 1756 | bcp->bc_refcount--; |
1757 | if (bcp->bc_refcount == 0) { | 1757 | if (bcp->bc_refcount == 0) { |
1758 | if (prevp == NULL) { | 1758 | if (prevp == NULL) { |
@@ -1772,7 +1772,7 @@ xlog_check_buffer_cancelled( | |||
1772 | * We didn't find a corresponding entry in the table, so | 1772 | * We didn't find a corresponding entry in the table, so |
1773 | * return 0 so that the buffer is NOT cancelled. | 1773 | * return 0 so that the buffer is NOT cancelled. |
1774 | */ | 1774 | */ |
1775 | ASSERT(!(flags & XFS_BLI_CANCEL)); | 1775 | ASSERT(!(flags & XFS_BLF_CANCEL)); |
1776 | return 0; | 1776 | return 0; |
1777 | } | 1777 | } |
1778 | 1778 | ||
@@ -1874,8 +1874,8 @@ xlog_recover_do_inode_buffer( | |||
1874 | nbits = xfs_contig_bits(data_map, map_size, | 1874 | nbits = xfs_contig_bits(data_map, map_size, |
1875 | bit); | 1875 | bit); |
1876 | ASSERT(nbits > 0); | 1876 | ASSERT(nbits > 0); |
1877 | reg_buf_offset = bit << XFS_BLI_SHIFT; | 1877 | reg_buf_offset = bit << XFS_BLF_SHIFT; |
1878 | reg_buf_bytes = nbits << XFS_BLI_SHIFT; | 1878 | reg_buf_bytes = nbits << XFS_BLF_SHIFT; |
1879 | item_index++; | 1879 | item_index++; |
1880 | } | 1880 | } |
1881 | 1881 | ||
@@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer( | |||
1889 | } | 1889 | } |
1890 | 1890 | ||
1891 | ASSERT(item->ri_buf[item_index].i_addr != NULL); | 1891 | ASSERT(item->ri_buf[item_index].i_addr != NULL); |
1892 | ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); | 1892 | ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); |
1893 | ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); | 1893 | ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); |
1894 | 1894 | ||
1895 | /* | 1895 | /* |
@@ -1955,9 +1955,9 @@ xlog_recover_do_reg_buffer( | |||
1955 | nbits = xfs_contig_bits(data_map, map_size, bit); | 1955 | nbits = xfs_contig_bits(data_map, map_size, bit); |
1956 | ASSERT(nbits > 0); | 1956 | ASSERT(nbits > 0); |
1957 | ASSERT(item->ri_buf[i].i_addr != NULL); | 1957 | ASSERT(item->ri_buf[i].i_addr != NULL); |
1958 | ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); | 1958 | ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); |
1959 | ASSERT(XFS_BUF_COUNT(bp) >= | 1959 | ASSERT(XFS_BUF_COUNT(bp) >= |
1960 | ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); | 1960 | ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); |
1961 | 1961 | ||
1962 | /* | 1962 | /* |
1963 | * Do a sanity check if this is a dquot buffer. Just checking | 1963 | * Do a sanity check if this is a dquot buffer. Just checking |
@@ -1966,7 +1966,7 @@ xlog_recover_do_reg_buffer( | |||
1966 | */ | 1966 | */ |
1967 | error = 0; | 1967 | error = 0; |
1968 | if (buf_f->blf_flags & | 1968 | if (buf_f->blf_flags & |
1969 | (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { | 1969 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { |
1970 | if (item->ri_buf[i].i_addr == NULL) { | 1970 | if (item->ri_buf[i].i_addr == NULL) { |
1971 | cmn_err(CE_ALERT, | 1971 | cmn_err(CE_ALERT, |
1972 | "XFS: NULL dquot in %s.", __func__); | 1972 | "XFS: NULL dquot in %s.", __func__); |
@@ -1987,9 +1987,9 @@ xlog_recover_do_reg_buffer( | |||
1987 | } | 1987 | } |
1988 | 1988 | ||
1989 | memcpy(xfs_buf_offset(bp, | 1989 | memcpy(xfs_buf_offset(bp, |
1990 | (uint)bit << XFS_BLI_SHIFT), /* dest */ | 1990 | (uint)bit << XFS_BLF_SHIFT), /* dest */ |
1991 | item->ri_buf[i].i_addr, /* source */ | 1991 | item->ri_buf[i].i_addr, /* source */ |
1992 | nbits<<XFS_BLI_SHIFT); /* length */ | 1992 | nbits<<XFS_BLF_SHIFT); /* length */ |
1993 | next: | 1993 | next: |
1994 | i++; | 1994 | i++; |
1995 | bit += nbits; | 1995 | bit += nbits; |
@@ -2148,11 +2148,11 @@ xlog_recover_do_dquot_buffer( | |||
2148 | } | 2148 | } |
2149 | 2149 | ||
2150 | type = 0; | 2150 | type = 0; |
2151 | if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) | 2151 | if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) |
2152 | type |= XFS_DQ_USER; | 2152 | type |= XFS_DQ_USER; |
2153 | if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) | 2153 | if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) |
2154 | type |= XFS_DQ_PROJ; | 2154 | type |= XFS_DQ_PROJ; |
2155 | if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) | 2155 | if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) |
2156 | type |= XFS_DQ_GROUP; | 2156 | type |= XFS_DQ_GROUP; |
2157 | /* | 2157 | /* |
2158 | * This type of quota was turned off, so ignore this buffer | 2158 | * This type of quota was turned off, so ignore this buffer |
@@ -2173,7 +2173,7 @@ xlog_recover_do_dquot_buffer( | |||
2173 | * here which overlaps that may be stale. | 2173 | * here which overlaps that may be stale. |
2174 | * | 2174 | * |
2175 | * When meta-data buffers are freed at run time we log a buffer item | 2175 | * When meta-data buffers are freed at run time we log a buffer item |
2176 | * with the XFS_BLI_CANCEL bit set to indicate that previous copies | 2176 | * with the XFS_BLF_CANCEL bit set to indicate that previous copies |
2177 | * of the buffer in the log should not be replayed at recovery time. | 2177 | * of the buffer in the log should not be replayed at recovery time. |
2178 | * This is so that if the blocks covered by the buffer are reused for | 2178 | * This is so that if the blocks covered by the buffer are reused for |
2179 | * file data before we crash we don't end up replaying old, freed | 2179 | * file data before we crash we don't end up replaying old, freed |
@@ -2207,7 +2207,7 @@ xlog_recover_do_buffer_trans( | |||
2207 | if (pass == XLOG_RECOVER_PASS1) { | 2207 | if (pass == XLOG_RECOVER_PASS1) { |
2208 | /* | 2208 | /* |
2209 | * In this pass we're only looking for buf items | 2209 | * In this pass we're only looking for buf items |
2210 | * with the XFS_BLI_CANCEL bit set. | 2210 | * with the XFS_BLF_CANCEL bit set. |
2211 | */ | 2211 | */ |
2212 | xlog_recover_do_buffer_pass1(log, buf_f); | 2212 | xlog_recover_do_buffer_pass1(log, buf_f); |
2213 | return 0; | 2213 | return 0; |
@@ -2244,7 +2244,7 @@ xlog_recover_do_buffer_trans( | |||
2244 | 2244 | ||
2245 | mp = log->l_mp; | 2245 | mp = log->l_mp; |
2246 | buf_flags = XBF_LOCK; | 2246 | buf_flags = XBF_LOCK; |
2247 | if (!(flags & XFS_BLI_INODE_BUF)) | 2247 | if (!(flags & XFS_BLF_INODE_BUF)) |
2248 | buf_flags |= XBF_MAPPED; | 2248 | buf_flags |= XBF_MAPPED; |
2249 | 2249 | ||
2250 | bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); | 2250 | bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); |
@@ -2257,10 +2257,10 @@ xlog_recover_do_buffer_trans( | |||
2257 | } | 2257 | } |
2258 | 2258 | ||
2259 | error = 0; | 2259 | error = 0; |
2260 | if (flags & XFS_BLI_INODE_BUF) { | 2260 | if (flags & XFS_BLF_INODE_BUF) { |
2261 | error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); | 2261 | error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); |
2262 | } else if (flags & | 2262 | } else if (flags & |
2263 | (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { | 2263 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { |
2264 | xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); | 2264 | xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); |
2265 | } else { | 2265 | } else { |
2266 | xlog_recover_do_reg_buffer(mp, item, bp, buf_f); | 2266 | xlog_recover_do_reg_buffer(mp, item, bp, buf_f); |
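The comment changes above spell out the XFS_BLF_CANCEL protocol: recovery pass 1 only collects buffer items that carry the cancel flag, and pass 2 then refuses to replay any earlier, stale copy of a cancelled buffer. A minimal conceptual sketch of the two passes follows; the cancel-table helpers are hypothetical stand-ins for the kernel's internal routines, not its real API:

        static int
        replay_buf_item(struct log *log, xfs_buf_log_format_t *buf_f, int pass)
        {
                if (pass == XLOG_RECOVER_PASS1) {
                        /* Pass 1: remember buffers logged with CANCEL set. */
                        if (buf_f->blf_flags & XFS_BLF_CANCEL)
                                cancel_table_add(log, buf_f->blf_blkno,
                                                 buf_f->blf_len);
                        return 0;
                }

                /* Pass 2: a later cancel makes this copy stale; skip it. */
                if (cancel_table_match(log, buf_f->blf_blkno, buf_f->blf_len))
                        return 0;

                return replay_buffer_contents(log, buf_f);
        }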
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h index 75d749207258..1c55ccbb379d 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/xfs_log_recover.h | |||
@@ -28,7 +28,7 @@ | |||
28 | #define XLOG_RHASH(tid) \ | 28 | #define XLOG_RHASH(tid) \ |
29 | ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) | 29 | ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) |
30 | 30 | ||
31 | #define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) | 31 | #define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) |
32 | 32 | ||
33 | 33 | ||
34 | /* | 34 | /* |
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 9ff48a16a7ee..1d2c7eed4eda 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
@@ -268,6 +268,7 @@ typedef struct xfs_mount { | |||
268 | #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops | 268 | #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops |
269 | must be synchronous except | 269 | must be synchronous except |
270 | for space allocations */ | 270 | for space allocations */ |
271 | #define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */ | ||
271 | #define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ | 272 | #define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ |
272 | #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) | 273 | #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) |
273 | #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem | 274 | #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem |
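XFS_MOUNT_DELAYLOG is the runtime switch the rest of the series tests before entering the CIL commit path. The xfs_super.c changes in this commit wire it to a mount option; assuming the usual XFS option-parsing pattern and the "delaylog"/"nodelaylog" names the new documentation uses, the hook amounts to something like this sketch:

        /* Sketch only; the real hook lives in xfs_super.c's option parser. */
        static void
        xfs_parse_delaylog(struct xfs_mount *mp, const char *opt)
        {
                if (!strcmp(opt, "delaylog"))
                        mp->m_flags |= XFS_MOUNT_DELAYLOG;
                else if (!strcmp(opt, "nodelaylog"))
                        mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
        }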
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index be578ecb4af2..ce558efa2ea0 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include "xfs_trans_priv.h" | 44 | #include "xfs_trans_priv.h" |
45 | #include "xfs_trans_space.h" | 45 | #include "xfs_trans_space.h" |
46 | #include "xfs_inode_item.h" | 46 | #include "xfs_inode_item.h" |
47 | #include "xfs_trace.h" | ||
47 | 48 | ||
48 | kmem_zone_t *xfs_trans_zone; | 49 | kmem_zone_t *xfs_trans_zone; |
49 | 50 | ||
@@ -243,9 +244,8 @@ _xfs_trans_alloc( | |||
243 | tp->t_type = type; | 244 | tp->t_type = type; |
244 | tp->t_mountp = mp; | 245 | tp->t_mountp = mp; |
245 | tp->t_items_free = XFS_LIC_NUM_SLOTS; | 246 | tp->t_items_free = XFS_LIC_NUM_SLOTS; |
246 | tp->t_busy_free = XFS_LBC_NUM_SLOTS; | ||
247 | xfs_lic_init(&(tp->t_items)); | 247 | xfs_lic_init(&(tp->t_items)); |
248 | XFS_LBC_INIT(&(tp->t_busy)); | 248 | INIT_LIST_HEAD(&tp->t_busy); |
249 | return tp; | 249 | return tp; |
250 | } | 250 | } |
251 | 251 | ||
@@ -255,8 +255,13 @@ _xfs_trans_alloc( | |||
255 | */ | 255 | */ |
256 | STATIC void | 256 | STATIC void |
257 | xfs_trans_free( | 257 | xfs_trans_free( |
258 | xfs_trans_t *tp) | 258 | struct xfs_trans *tp) |
259 | { | 259 | { |
260 | struct xfs_busy_extent *busyp, *n; | ||
261 | |||
262 | list_for_each_entry_safe(busyp, n, &tp->t_busy, list) | ||
263 | xfs_alloc_busy_clear(tp->t_mountp, busyp); | ||
264 | |||
260 | atomic_dec(&tp->t_mountp->m_active_trans); | 265 | atomic_dec(&tp->t_mountp->m_active_trans); |
261 | xfs_trans_free_dqinfo(tp); | 266 | xfs_trans_free_dqinfo(tp); |
262 | kmem_zone_free(xfs_trans_zone, tp); | 267 | kmem_zone_free(xfs_trans_zone, tp); |
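xfs_trans_free() walks t_busy with the _safe list iterator because xfs_alloc_busy_clear() unlinks and frees each busy extent as it is visited; the plain list_for_each_entry() would step into freed memory. The pattern in isolation:

        struct item {
                struct list_head        list;
                /* payload ... */
        };

        static void
        free_all(struct list_head *head)
        {
                struct item     *ip, *n;

                /* 'n' caches the next entry before 'ip' is torn down. */
                list_for_each_entry_safe(ip, n, head, list) {
                        list_del(&ip->list);
                        kfree(ip);
                }
        }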
@@ -285,9 +290,8 @@ xfs_trans_dup( | |||
285 | ntp->t_type = tp->t_type; | 290 | ntp->t_type = tp->t_type; |
286 | ntp->t_mountp = tp->t_mountp; | 291 | ntp->t_mountp = tp->t_mountp; |
287 | ntp->t_items_free = XFS_LIC_NUM_SLOTS; | 292 | ntp->t_items_free = XFS_LIC_NUM_SLOTS; |
288 | ntp->t_busy_free = XFS_LBC_NUM_SLOTS; | ||
289 | xfs_lic_init(&(ntp->t_items)); | 293 | xfs_lic_init(&(ntp->t_items)); |
290 | XFS_LBC_INIT(&(ntp->t_busy)); | 294 | INIT_LIST_HEAD(&ntp->t_busy); |
291 | 295 | ||
292 | ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); | 296 | ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); |
293 | ASSERT(tp->t_ticket != NULL); | 297 | ASSERT(tp->t_ticket != NULL); |
@@ -423,7 +427,6 @@ undo_blocks: | |||
423 | return error; | 427 | return error; |
424 | } | 428 | } |
425 | 429 | ||
426 | |||
427 | /* | 430 | /* |
428 | * Record the indicated change to the given field for application | 431 | * Record the indicated change to the given field for application |
429 | * to the file system's superblock when the transaction commits. | 432 | * to the file system's superblock when the transaction commits. |
@@ -652,7 +655,7 @@ xfs_trans_apply_sb_deltas( | |||
652 | * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we | 655 | * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we |
653 | * still need to update the incore superblock with the changes. | 656 | * still need to update the incore superblock with the changes. |
654 | */ | 657 | */ |
655 | STATIC void | 658 | void |
656 | xfs_trans_unreserve_and_mod_sb( | 659 | xfs_trans_unreserve_and_mod_sb( |
657 | xfs_trans_t *tp) | 660 | xfs_trans_t *tp) |
658 | { | 661 | { |
@@ -880,7 +883,7 @@ xfs_trans_fill_vecs( | |||
880 | * they could be immediately flushed and we'd have to race with the flusher | 883 | * they could be immediately flushed and we'd have to race with the flusher |
881 | * trying to pull the item from the AIL as we add it. | 884 | * trying to pull the item from the AIL as we add it. |
882 | */ | 885 | */ |
883 | static void | 886 | void |
884 | xfs_trans_item_committed( | 887 | xfs_trans_item_committed( |
885 | struct xfs_log_item *lip, | 888 | struct xfs_log_item *lip, |
886 | xfs_lsn_t commit_lsn, | 889 | xfs_lsn_t commit_lsn, |
@@ -930,26 +933,6 @@ xfs_trans_item_committed( | |||
930 | IOP_UNPIN(lip); | 933 | IOP_UNPIN(lip); |
931 | } | 934 | } |
932 | 935 | ||
933 | /* Clear all the per-AG busy list items listed in this transaction */ | ||
934 | static void | ||
935 | xfs_trans_clear_busy_extents( | ||
936 | struct xfs_trans *tp) | ||
937 | { | ||
938 | xfs_log_busy_chunk_t *lbcp; | ||
939 | xfs_log_busy_slot_t *lbsp; | ||
940 | int i; | ||
941 | |||
942 | for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) { | ||
943 | i = 0; | ||
944 | for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { | ||
945 | if (XFS_LBC_ISFREE(lbcp, i)) | ||
946 | continue; | ||
947 | xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx); | ||
948 | } | ||
949 | } | ||
950 | xfs_trans_free_busy(tp); | ||
951 | } | ||
952 | |||
953 | /* | 936 | /* |
954 | * This is typically called by the LM when a transaction has been fully | 937 | * This is typically called by the LM when a transaction has been fully |
955 | * committed to disk. It needs to unpin the items which have | 938 | * committed to disk. It needs to unpin the items which have |
@@ -984,7 +967,6 @@ xfs_trans_committed( | |||
984 | kmem_free(licp); | 967 | kmem_free(licp); |
985 | } | 968 | } |
986 | 969 | ||
987 | xfs_trans_clear_busy_extents(tp); | ||
988 | xfs_trans_free(tp); | 970 | xfs_trans_free(tp); |
989 | } | 971 | } |
990 | 972 | ||
@@ -1012,8 +994,7 @@ xfs_trans_uncommit( | |||
1012 | xfs_trans_unreserve_and_mod_sb(tp); | 994 | xfs_trans_unreserve_and_mod_sb(tp); |
1013 | xfs_trans_unreserve_and_mod_dquots(tp); | 995 | xfs_trans_unreserve_and_mod_dquots(tp); |
1014 | 996 | ||
1015 | xfs_trans_free_items(tp, flags); | 997 | xfs_trans_free_items(tp, NULLCOMMITLSN, flags); |
1016 | xfs_trans_free_busy(tp); | ||
1017 | xfs_trans_free(tp); | 998 | xfs_trans_free(tp); |
1018 | } | 999 | } |
1019 | 1000 | ||
@@ -1075,6 +1056,8 @@ xfs_trans_commit_iclog( | |||
1075 | *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); | 1056 | *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); |
1076 | 1057 | ||
1077 | tp->t_commit_lsn = *commit_lsn; | 1058 | tp->t_commit_lsn = *commit_lsn; |
1059 | trace_xfs_trans_commit_lsn(tp); | ||
1060 | |||
1078 | if (nvec > XFS_TRANS_LOGVEC_COUNT) | 1061 | if (nvec > XFS_TRANS_LOGVEC_COUNT) |
1079 | kmem_free(log_vector); | 1062 | kmem_free(log_vector); |
1080 | 1063 | ||
@@ -1161,6 +1144,93 @@ xfs_trans_commit_iclog( | |||
1161 | return xfs_log_release_iclog(mp, commit_iclog); | 1144 | return xfs_log_release_iclog(mp, commit_iclog); |
1162 | } | 1145 | } |
1163 | 1146 | ||
1147 | /* | ||
1148 | * Walk the log items and allocate log vector structures for | ||
1149 | * each item large enough to fit all the vectors they require. | ||
1150 | * Note that this format differs from the old log vector format in | ||
1151 | * that there is no transaction header in these log vectors. | ||
1152 | */ | ||
1153 | STATIC struct xfs_log_vec * | ||
1154 | xfs_trans_alloc_log_vecs( | ||
1155 | xfs_trans_t *tp) | ||
1156 | { | ||
1157 | xfs_log_item_desc_t *lidp; | ||
1158 | struct xfs_log_vec *lv = NULL; | ||
1159 | struct xfs_log_vec *ret_lv = NULL; | ||
1160 | |||
1161 | lidp = xfs_trans_first_item(tp); | ||
1162 | |||
1163 | /* Bail out if we didn't find a log item. */ | ||
1164 | if (!lidp) { | ||
1165 | ASSERT(0); | ||
1166 | return NULL; | ||
1167 | } | ||
1168 | |||
1169 | while (lidp != NULL) { | ||
1170 | struct xfs_log_vec *new_lv; | ||
1171 | |||
1172 | /* Skip items which aren't dirty in this transaction. */ | ||
1173 | if (!(lidp->lid_flags & XFS_LID_DIRTY)) { | ||
1174 | lidp = xfs_trans_next_item(tp, lidp); | ||
1175 | continue; | ||
1176 | } | ||
1177 | |||
1178 | /* Skip items that do not have any vectors for writing */ | ||
1179 | lidp->lid_size = IOP_SIZE(lidp->lid_item); | ||
1180 | if (!lidp->lid_size) { | ||
1181 | lidp = xfs_trans_next_item(tp, lidp); | ||
1182 | continue; | ||
1183 | } | ||
1184 | |||
1185 | new_lv = kmem_zalloc(sizeof(*new_lv) + | ||
1186 | lidp->lid_size * sizeof(struct xfs_log_iovec), | ||
1187 | KM_SLEEP); | ||
1188 | |||
1189 | /* The allocated iovec region lies beyond the log vector. */ | ||
1190 | new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; | ||
1191 | new_lv->lv_niovecs = lidp->lid_size; | ||
1192 | new_lv->lv_item = lidp->lid_item; | ||
1193 | if (!ret_lv) | ||
1194 | ret_lv = new_lv; | ||
1195 | else | ||
1196 | lv->lv_next = new_lv; | ||
1197 | lv = new_lv; | ||
1198 | lidp = xfs_trans_next_item(tp, lidp); | ||
1199 | } | ||
1200 | |||
1201 | return ret_lv; | ||
1202 | } | ||
1203 | |||
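The single kmem_zalloc() above sizes one allocation to hold both the log vector header and its iovec array, and lv_iovecp simply points past the header (&new_lv[1]), so each item's vectors are torn down with one free. The co-allocation idiom, reduced to its essentials with generic names:

        struct vec_hdr {
                struct vec_hdr  *next;
                int             nvecs;
                struct kvec     *vecs;          /* lives in the same allocation */
        };

        static struct vec_hdr *
        alloc_vec_hdr(int nvecs)
        {
                struct vec_hdr  *h;

                h = kzalloc(sizeof(*h) + nvecs * sizeof(struct kvec),
                            GFP_KERNEL);
                if (!h)
                        return NULL;
                /* The vector array starts immediately after the header. */
                h->vecs = (struct kvec *)&h[1];
                h->nvecs = nvecs;
                return h;
        }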
1204 | static int | ||
1205 | xfs_trans_commit_cil( | ||
1206 | struct xfs_mount *mp, | ||
1207 | struct xfs_trans *tp, | ||
1208 | xfs_lsn_t *commit_lsn, | ||
1209 | int flags) | ||
1210 | { | ||
1211 | struct xfs_log_vec *log_vector; | ||
1212 | int error; | ||
1213 | |||
1214 | /* | ||
1215 | * Get each log item to allocate a vector structure for | ||
1216 | * the log item to pass to the log write code. The | ||
1217 | * CIL commit code will format the vector and save it away. | ||
1218 | */ | ||
1219 | log_vector = xfs_trans_alloc_log_vecs(tp); | ||
1220 | if (!log_vector) | ||
1221 | return ENOMEM; | ||
1222 | |||
1223 | error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags); | ||
1224 | if (error) | ||
1225 | return error; | ||
1226 | |||
1227 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); | ||
1228 | |||
1229 | /* xfs_trans_free_items() unlocks them first */ | ||
1230 | xfs_trans_free_items(tp, *commit_lsn, 0); | ||
1231 | xfs_trans_free(tp); | ||
1232 | return 0; | ||
1233 | } | ||
1164 | 1234 | ||
1165 | /* | 1235 | /* |
1166 | * xfs_trans_commit | 1236 | * xfs_trans_commit |
@@ -1221,7 +1291,11 @@ _xfs_trans_commit( | |||
1221 | xfs_trans_apply_sb_deltas(tp); | 1291 | xfs_trans_apply_sb_deltas(tp); |
1222 | xfs_trans_apply_dquot_deltas(tp); | 1292 | xfs_trans_apply_dquot_deltas(tp); |
1223 | 1293 | ||
1224 | error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); | 1294 | if (mp->m_flags & XFS_MOUNT_DELAYLOG) |
1295 | error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags); | ||
1296 | else | ||
1297 | error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); | ||
1298 | |||
1225 | if (error == ENOMEM) { | 1299 | if (error == ENOMEM) { |
1226 | xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); | 1300 | xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); |
1227 | error = XFS_ERROR(EIO); | 1301 | error = XFS_ERROR(EIO); |
@@ -1259,8 +1333,7 @@ out_unreserve: | |||
1259 | error = XFS_ERROR(EIO); | 1333 | error = XFS_ERROR(EIO); |
1260 | } | 1334 | } |
1261 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); | 1335 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); |
1262 | xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); | 1336 | xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); |
1263 | xfs_trans_free_busy(tp); | ||
1264 | xfs_trans_free(tp); | 1337 | xfs_trans_free(tp); |
1265 | 1338 | ||
1266 | XFS_STATS_INC(xs_trans_empty); | 1339 | XFS_STATS_INC(xs_trans_empty); |
@@ -1338,8 +1411,7 @@ xfs_trans_cancel( | |||
1338 | /* mark this thread as no longer being in a transaction */ | 1411 | /* mark this thread as no longer being in a transaction */ |
1339 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); | 1412 | current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); |
1340 | 1413 | ||
1341 | xfs_trans_free_items(tp, flags); | 1414 | xfs_trans_free_items(tp, NULLCOMMITLSN, flags); |
1342 | xfs_trans_free_busy(tp); | ||
1343 | xfs_trans_free(tp); | 1415 | xfs_trans_free(tp); |
1344 | } | 1416 | } |
1345 | 1417 | ||
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c62beee0921e..8c69e7824f68 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header { | |||
106 | #define XFS_TRANS_GROWFSRT_FREE 39 | 106 | #define XFS_TRANS_GROWFSRT_FREE 39 |
107 | #define XFS_TRANS_SWAPEXT 40 | 107 | #define XFS_TRANS_SWAPEXT 40 |
108 | #define XFS_TRANS_SB_COUNT 41 | 108 | #define XFS_TRANS_SB_COUNT 41 |
109 | #define XFS_TRANS_TYPE_MAX 41 | 109 | #define XFS_TRANS_CHECKPOINT 42 |
110 | #define XFS_TRANS_TYPE_MAX 42 | ||
110 | /* new transaction types need to be reflected in xfs_logprint(8) */ | 111 | /* new transaction types need to be reflected in xfs_logprint(8) */ |
111 | 112 | ||
112 | #define XFS_TRANS_TYPES \ | 113 | #define XFS_TRANS_TYPES \ |
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header { | |||
148 | { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ | 149 | { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ |
149 | { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ | 150 | { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ |
150 | { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ | 151 | { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ |
152 | { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ | ||
151 | { XFS_TRANS_DUMMY1, "DUMMY1" }, \ | 153 | { XFS_TRANS_DUMMY1, "DUMMY1" }, \ |
152 | { XFS_TRANS_DUMMY2, "DUMMY2" }, \ | 154 | { XFS_TRANS_DUMMY2, "DUMMY2" }, \ |
153 | { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } | 155 | { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } |
@@ -813,6 +815,7 @@ struct xfs_log_item_desc; | |||
813 | struct xfs_mount; | 815 | struct xfs_mount; |
814 | struct xfs_trans; | 816 | struct xfs_trans; |
815 | struct xfs_dquot_acct; | 817 | struct xfs_dquot_acct; |
818 | struct xfs_busy_extent; | ||
816 | 819 | ||
817 | typedef struct xfs_log_item { | 820 | typedef struct xfs_log_item { |
818 | struct list_head li_ail; /* AIL pointers */ | 821 | struct list_head li_ail; /* AIL pointers */ |
@@ -828,6 +831,11 @@ typedef struct xfs_log_item { | |||
828 | /* buffer item iodone */ | 831 | /* buffer item iodone */ |
829 | /* callback func */ | 832 | /* callback func */ |
830 | struct xfs_item_ops *li_ops; /* function list */ | 833 | struct xfs_item_ops *li_ops; /* function list */ |
834 | |||
835 | /* delayed logging */ | ||
836 | struct list_head li_cil; /* CIL pointers */ | ||
837 | struct xfs_log_vec *li_lv; /* active log vector */ | ||
838 | xfs_lsn_t li_seq; /* CIL commit seq */ | ||
831 | } xfs_log_item_t; | 839 | } xfs_log_item_t; |
832 | 840 | ||
833 | #define XFS_LI_IN_AIL 0x1 | 841 | #define XFS_LI_IN_AIL 0x1 |
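The three new fields carry a log item's delayed-logging state: li_cil links the item into the Committed Item List, li_lv holds the most recently formatted log vector for the item, and li_seq records the checkpoint sequence it belongs to. How a commit attaches an item, as a sketch against the xfs_log_cil.c code this commit adds (helper and field names here are illustrative, not verbatim):

        /* Sketch: swap in the fresh vector and queue the item on the CIL. */
        static void
        cil_insert_item(struct xfs_cil *cil, struct xfs_log_item *lip,
                        struct xfs_log_vec *lv)
        {
                lip->li_lv = lv;                /* vector formatted at commit */
                lip->li_seq = cil->xc_ctx->sequence;
                if (list_empty(&lip->li_cil))
                        list_add_tail(&lip->li_cil, &cil->xc_cil);
        }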
@@ -872,34 +880,6 @@ typedef struct xfs_item_ops { | |||
872 | #define XFS_ITEM_PUSHBUF 3 | 880 | #define XFS_ITEM_PUSHBUF 3 |
873 | 881 | ||
874 | /* | 882 | /* |
875 | * This structure is used to maintain a list of block ranges that have been | ||
876 | * freed in the transaction. The ranges are listed in the perag[] busy list | ||
877 | * between when they're freed and the transaction is committed to disk. | ||
878 | */ | ||
879 | |||
880 | typedef struct xfs_log_busy_slot { | ||
881 | xfs_agnumber_t lbc_ag; | ||
882 | ushort lbc_idx; /* index in perag.busy[] */ | ||
883 | } xfs_log_busy_slot_t; | ||
884 | |||
885 | #define XFS_LBC_NUM_SLOTS 31 | ||
886 | typedef struct xfs_log_busy_chunk { | ||
887 | struct xfs_log_busy_chunk *lbc_next; | ||
888 | uint lbc_free; /* free slots bitmask */ | ||
889 | ushort lbc_unused; /* first unused */ | ||
890 | xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; | ||
891 | } xfs_log_busy_chunk_t; | ||
892 | |||
893 | #define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) | ||
894 | #define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) | ||
895 | |||
896 | #define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) | ||
897 | #define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) | ||
898 | #define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) | ||
899 | #define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) | ||
900 | #define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) | ||
901 | |||
902 | /* | ||
903 | * This is the type of function which can be given to xfs_trans_callback() | 883 | * This is the type of function which can be given to xfs_trans_callback() |
904 | * to be called upon the transaction's commit to disk. | 884 | * to be called upon the transaction's commit to disk. |
905 | */ | 885 | */ |
@@ -950,8 +930,7 @@ typedef struct xfs_trans { | |||
950 | unsigned int t_items_free; /* log item descs free */ | 930 | unsigned int t_items_free; /* log item descs free */ |
951 | xfs_log_item_chunk_t t_items; /* first log item desc chunk */ | 931 | xfs_log_item_chunk_t t_items; /* first log item desc chunk */ |
952 | xfs_trans_header_t t_header; /* header for in-log trans */ | 932 | xfs_trans_header_t t_header; /* header for in-log trans */ |
953 | unsigned int t_busy_free; /* busy descs free */ | 933 | struct list_head t_busy; /* list of busy extents */ |
954 | xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ | ||
955 | unsigned long t_pflags; /* saved process flags state */ | 934 | unsigned long t_pflags; /* saved process flags state */ |
956 | } xfs_trans_t; | 935 | } xfs_trans_t; |
957 | 936 | ||
@@ -1025,9 +1004,6 @@ int _xfs_trans_commit(xfs_trans_t *, | |||
1025 | void xfs_trans_cancel(xfs_trans_t *, int); | 1004 | void xfs_trans_cancel(xfs_trans_t *, int); |
1026 | int xfs_trans_ail_init(struct xfs_mount *); | 1005 | int xfs_trans_ail_init(struct xfs_mount *); |
1027 | void xfs_trans_ail_destroy(struct xfs_mount *); | 1006 | void xfs_trans_ail_destroy(struct xfs_mount *); |
1028 | xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, | ||
1029 | xfs_agnumber_t ag, | ||
1030 | xfs_extlen_t idx); | ||
1031 | 1007 | ||
1032 | extern kmem_zone_t *xfs_trans_zone; | 1008 | extern kmem_zone_t *xfs_trans_zone; |
1033 | 1009 | ||
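The new t_busy list head strings together struct xfs_busy_extent entries in place of the deleted slot/chunk machinery. That structure is declared by this commit's xfs_alloc.h changes; from its use here it has roughly this shape:

        struct xfs_busy_extent {
                struct rb_node          rb_node;        /* per-AG by-bno tree */
                struct list_head        list;           /* transaction t_busy list */
                xfs_agnumber_t          agno;
                xfs_agblock_t           bno;
                xfs_extlen_t            length;
                xlog_tid_t              tid;            /* creating transaction */
        };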
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 9cd809025f3a..63d81a22f4fd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
@@ -114,7 +114,7 @@ _xfs_trans_bjoin( | |||
114 | xfs_buf_item_init(bp, tp->t_mountp); | 114 | xfs_buf_item_init(bp, tp->t_mountp); |
115 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); | 115 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); |
116 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 116 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
117 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); | 117 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); |
118 | ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); | 118 | ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); |
119 | if (reset_recur) | 119 | if (reset_recur) |
120 | bip->bli_recur = 0; | 120 | bip->bli_recur = 0; |
@@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp, | |||
511 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); | 511 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); |
512 | ASSERT(bip->bli_item.li_type == XFS_LI_BUF); | 512 | ASSERT(bip->bli_item.li_type == XFS_LI_BUF); |
513 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 513 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
514 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); | 514 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); |
515 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 515 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
516 | 516 | ||
517 | /* | 517 | /* |
@@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp, | |||
619 | 619 | ||
620 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); | 620 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); |
621 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 621 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
622 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); | 622 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); |
623 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 623 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
624 | bip->bli_flags |= XFS_BLI_HOLD; | 624 | bip->bli_flags |= XFS_BLI_HOLD; |
625 | trace_xfs_trans_bhold(bip); | 625 | trace_xfs_trans_bhold(bip); |
@@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, | |||
641 | 641 | ||
642 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); | 642 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); |
643 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 643 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
644 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); | 644 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); |
645 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 645 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
646 | ASSERT(bip->bli_flags & XFS_BLI_HOLD); | 646 | ASSERT(bip->bli_flags & XFS_BLI_HOLD); |
647 | bip->bli_flags &= ~XFS_BLI_HOLD; | 647 | bip->bli_flags &= ~XFS_BLI_HOLD; |
@@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, | |||
704 | bip->bli_flags &= ~XFS_BLI_STALE; | 704 | bip->bli_flags &= ~XFS_BLI_STALE; |
705 | ASSERT(XFS_BUF_ISSTALE(bp)); | 705 | ASSERT(XFS_BUF_ISSTALE(bp)); |
706 | XFS_BUF_UNSTALE(bp); | 706 | XFS_BUF_UNSTALE(bp); |
707 | bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; | 707 | bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; |
708 | } | 708 | } |
709 | 709 | ||
710 | lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); | 710 | lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); |
@@ -762,8 +762,8 @@ xfs_trans_binval( | |||
762 | ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); | 762 | ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); |
763 | ASSERT(XFS_BUF_ISSTALE(bp)); | 763 | ASSERT(XFS_BUF_ISSTALE(bp)); |
764 | ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); | 764 | ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); |
765 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); | 765 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); |
766 | ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); | 766 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); |
767 | ASSERT(lidp->lid_flags & XFS_LID_DIRTY); | 767 | ASSERT(lidp->lid_flags & XFS_LID_DIRTY); |
768 | ASSERT(tp->t_flags & XFS_TRANS_DIRTY); | 768 | ASSERT(tp->t_flags & XFS_TRANS_DIRTY); |
769 | return; | 769 | return; |
@@ -774,7 +774,7 @@ xfs_trans_binval( | |||
774 | * in the buf log item. The STALE flag will be used in | 774 | * in the buf log item. The STALE flag will be used in |
775 | * xfs_buf_item_unpin() to determine if it should clean up | 775 | * xfs_buf_item_unpin() to determine if it should clean up |
776 | * when the last reference to the buf item is given up. | 776 | * when the last reference to the buf item is given up. |
777 | * We set the XFS_BLI_CANCEL flag in the buf log format structure | 777 | * We set the XFS_BLF_CANCEL flag in the buf log format structure |
778 | * and log the buf item. This will be used at recovery time | 778 | * and log the buf item. This will be used at recovery time |
779 | * to determine that copies of the buffer in the log before | 779 | * to determine that copies of the buffer in the log before |
780 | * this should not be replayed. | 780 | * this should not be replayed. |
@@ -792,9 +792,9 @@ xfs_trans_binval( | |||
792 | XFS_BUF_UNDELAYWRITE(bp); | 792 | XFS_BUF_UNDELAYWRITE(bp); |
793 | XFS_BUF_STALE(bp); | 793 | XFS_BUF_STALE(bp); |
794 | bip->bli_flags |= XFS_BLI_STALE; | 794 | bip->bli_flags |= XFS_BLI_STALE; |
795 | bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); | 795 | bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); |
796 | bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; | 796 | bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; |
797 | bip->bli_format.blf_flags |= XFS_BLI_CANCEL; | 797 | bip->bli_format.blf_flags |= XFS_BLF_CANCEL; |
798 | memset((char *)(bip->bli_format.blf_data_map), 0, | 798 | memset((char *)(bip->bli_format.blf_data_map), 0, |
799 | (bip->bli_format.blf_map_size * sizeof(uint))); | 799 | (bip->bli_format.blf_map_size * sizeof(uint))); |
800 | lidp->lid_flags |= XFS_LID_DIRTY; | 800 | lidp->lid_flags |= XFS_LID_DIRTY; |
@@ -802,16 +802,16 @@ xfs_trans_binval( | |||
802 | } | 802 | } |
803 | 803 | ||
804 | /* | 804 | /* |
805 | * This call is used to indicate that the buffer contains on-disk | 805 | * This call is used to indicate that the buffer contains on-disk inodes which |
806 | * inodes which must be handled specially during recovery. They | 806 | * must be handled specially during recovery. They require special handling |
807 | * require special handling because only the di_next_unlinked from | 807 | * because only the di_next_unlinked from the inodes in the buffer should be |
808 | * the inodes in the buffer should be recovered. The rest of the | 808 | * recovered. The rest of the data in the buffer is logged via the inodes |
809 | * data in the buffer is logged via the inodes themselves. | 809 | * themselves. |
810 | * | 810 | * |
811 | * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log | 811 | * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be |
812 | * format structure so that we'll know what to do at recovery time. | 812 | * transferred to the buffer's log format structure so that we'll know what to |
813 | * do at recovery time. | ||
813 | */ | 814 | */ |
814 | /* ARGSUSED */ | ||
815 | void | 815 | void |
816 | xfs_trans_inode_buf( | 816 | xfs_trans_inode_buf( |
817 | xfs_trans_t *tp, | 817 | xfs_trans_t *tp, |
@@ -826,7 +826,7 @@ xfs_trans_inode_buf( | |||
826 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); | 826 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); |
827 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 827 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
828 | 828 | ||
829 | bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; | 829 | bip->bli_flags |= XFS_BLI_INODE_BUF; |
830 | } | 830 | } |
831 | 831 | ||
832 | /* | 832 | /* |
@@ -908,9 +908,9 @@ xfs_trans_dquot_buf( | |||
908 | ASSERT(XFS_BUF_ISBUSY(bp)); | 908 | ASSERT(XFS_BUF_ISBUSY(bp)); |
909 | ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); | 909 | ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); |
910 | ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); | 910 | ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); |
911 | ASSERT(type == XFS_BLI_UDQUOT_BUF || | 911 | ASSERT(type == XFS_BLF_UDQUOT_BUF || |
912 | type == XFS_BLI_PDQUOT_BUF || | 912 | type == XFS_BLF_PDQUOT_BUF || |
913 | type == XFS_BLI_GDQUOT_BUF); | 913 | type == XFS_BLF_GDQUOT_BUF); |
914 | 914 | ||
915 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); | 915 | bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); |
916 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 916 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
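Note the namespace split this file now depends on: XFS_BLI_* flags describe the in-memory buf log item (bip->bli_flags), while XFS_BLF_* flags live in the on-disk log format (bip->bli_format.blf_flags). xfs_trans_inode_buf() above therefore only sets the in-memory bit; the xfs_buf_item.c part of this commit transfers it when the item is formatted, along these lines (a sketch of the expected hunk, not a verbatim quote):

        /* In xfs_buf_item_format(), roughly: move the in-memory
         * inode-buffer state into the log format flags, then clear it. */
        if (bip->bli_flags & XFS_BLI_INODE_BUF) {
                bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
                bip->bli_flags &= ~XFS_BLI_INODE_BUF;
        }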
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c index eb3fc57f9eef..f11d37d06dcc 100644 --- a/fs/xfs/xfs_trans_item.c +++ b/fs/xfs/xfs_trans_item.c | |||
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) | |||
299 | void | 299 | void |
300 | xfs_trans_free_items( | 300 | xfs_trans_free_items( |
301 | xfs_trans_t *tp, | 301 | xfs_trans_t *tp, |
302 | xfs_lsn_t commit_lsn, | ||
302 | int flags) | 303 | int flags) |
303 | { | 304 | { |
304 | xfs_log_item_chunk_t *licp; | 305 | xfs_log_item_chunk_t *licp; |
@@ -311,7 +312,7 @@ xfs_trans_free_items( | |||
311 | * Special case the embedded chunk so we don't free it below. | 312 | * Special case the embedded chunk so we don't free it below. |
312 | */ | 313 | */ |
313 | if (!xfs_lic_are_all_free(licp)) { | 314 | if (!xfs_lic_are_all_free(licp)) { |
314 | (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); | 315 | (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); |
315 | xfs_lic_all_free(licp); | 316 | xfs_lic_all_free(licp); |
316 | licp->lic_unused = 0; | 317 | licp->lic_unused = 0; |
317 | } | 318 | } |
@@ -322,7 +323,7 @@ xfs_trans_free_items( | |||
322 | */ | 323 | */ |
323 | while (licp != NULL) { | 324 | while (licp != NULL) { |
324 | ASSERT(!xfs_lic_are_all_free(licp)); | 325 | ASSERT(!xfs_lic_are_all_free(licp)); |
325 | (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); | 326 | (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn); |
326 | next_licp = licp->lic_next; | 327 | next_licp = licp->lic_next; |
327 | kmem_free(licp); | 328 | kmem_free(licp); |
328 | licp = next_licp; | 329 | licp = next_licp; |
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk( | |||
438 | 439 | ||
439 | return freed; | 440 | return freed; |
440 | } | 441 | } |
441 | |||
442 | |||
443 | /* | ||
444 | * This is called to add the given busy item to the transaction's | ||
445 | * list of busy items. It must find a free busy item descriptor | ||
446 | * or allocate a new one and add the item to that descriptor. | ||
447 | * The function returns a pointer to busy descriptor used to point | ||
448 | * to the new busy entry. The log busy entry will now point to its new | ||
449 | * descriptor with its ???? field. | ||
450 | */ | ||
451 | xfs_log_busy_slot_t * | ||
452 | xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx) | ||
453 | { | ||
454 | xfs_log_busy_chunk_t *lbcp; | ||
455 | xfs_log_busy_slot_t *lbsp; | ||
456 | int i=0; | ||
457 | |||
458 | /* | ||
459 | * If there are no free descriptors, allocate a new chunk | ||
460 | * of them and put it at the front of the chunk list. | ||
461 | */ | ||
462 | if (tp->t_busy_free == 0) { | ||
463 | lbcp = (xfs_log_busy_chunk_t*) | ||
464 | kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP); | ||
465 | ASSERT(lbcp != NULL); | ||
466 | /* | ||
467 | * Initialize the chunk, and then | ||
468 | * claim the first slot in the newly allocated chunk. | ||
469 | */ | ||
470 | XFS_LBC_INIT(lbcp); | ||
471 | XFS_LBC_CLAIM(lbcp, 0); | ||
472 | lbcp->lbc_unused = 1; | ||
473 | lbsp = XFS_LBC_SLOT(lbcp, 0); | ||
474 | |||
475 | /* | ||
476 | * Link in the new chunk and update the free count. | ||
477 | */ | ||
478 | lbcp->lbc_next = tp->t_busy.lbc_next; | ||
479 | tp->t_busy.lbc_next = lbcp; | ||
480 | tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1; | ||
481 | |||
482 | /* | ||
483 | * Initialize the descriptor and the generic portion | ||
484 | * of the log item. | ||
485 | * | ||
486 | * Point the new slot at this item and return it. | ||
487 | * Also point the log item at its currently active | ||
488 | * descriptor and set the item's mount pointer. | ||
489 | */ | ||
490 | lbsp->lbc_ag = ag; | ||
491 | lbsp->lbc_idx = idx; | ||
492 | return lbsp; | ||
493 | } | ||
494 | |||
495 | /* | ||
496 | * Find the free descriptor. It is somewhere in the chunklist | ||
497 | * of descriptors. | ||
498 | */ | ||
499 | lbcp = &tp->t_busy; | ||
500 | while (lbcp != NULL) { | ||
501 | if (XFS_LBC_VACANCY(lbcp)) { | ||
502 | if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) { | ||
503 | i = lbcp->lbc_unused; | ||
504 | break; | ||
505 | } else { | ||
506 | /* out-of-order vacancy */ | ||
507 | cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp); | ||
508 | ASSERT(0); | ||
509 | } | ||
510 | } | ||
511 | lbcp = lbcp->lbc_next; | ||
512 | } | ||
513 | ASSERT(lbcp != NULL); | ||
514 | /* | ||
515 | * If we find a free descriptor, claim it, | ||
516 | * initialize it, and return it. | ||
517 | */ | ||
518 | XFS_LBC_CLAIM(lbcp, i); | ||
519 | if (lbcp->lbc_unused <= i) { | ||
520 | lbcp->lbc_unused = i + 1; | ||
521 | } | ||
522 | lbsp = XFS_LBC_SLOT(lbcp, i); | ||
523 | tp->t_busy_free--; | ||
524 | lbsp->lbc_ag = ag; | ||
525 | lbsp->lbc_idx = idx; | ||
526 | return lbsp; | ||
527 | } | ||
528 | |||
529 | |||
530 | /* | ||
531 | * xfs_trans_free_busy | ||
532 | * Free all of the busy lists from a transaction | ||
533 | */ | ||
534 | void | ||
535 | xfs_trans_free_busy(xfs_trans_t *tp) | ||
536 | { | ||
537 | xfs_log_busy_chunk_t *lbcp; | ||
538 | xfs_log_busy_chunk_t *lbcq; | ||
539 | |||
540 | lbcp = tp->t_busy.lbc_next; | ||
541 | while (lbcp != NULL) { | ||
542 | lbcq = lbcp->lbc_next; | ||
543 | kmem_free(lbcp); | ||
544 | lbcp = lbcq; | ||
545 | } | ||
546 | |||
547 | XFS_LBC_INIT(&tp->t_busy); | ||
548 | tp->t_busy.lbc_unused = 0; | ||
549 | } | ||
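With the chunk allocator gone, each freed extent gets its own struct xfs_busy_extent, inserted into a per-AG tree for allocator lookups and onto the transaction's t_busy list that xfs_trans_free() now clears. A sketch of the replacement flow, with a hypothetical wrapper standing in for the xfs_alloc.c code this commit adds:

        /* Hypothetical wrapper: record a freed extent as busy. */
        static void
        record_busy_extent(struct xfs_trans *tp, xfs_agnumber_t agno,
                           xfs_agblock_t bno, xfs_extlen_t len)
        {
                struct xfs_busy_extent  *busyp;

                busyp = kmem_zalloc(sizeof(*busyp), KM_SLEEP);
                busyp->agno = agno;
                busyp->bno = bno;
                busyp->length = len;

                /* ... insert into the per-AG rbtree here ... */

                /* Queued on the transaction; xfs_alloc_busy_clear() frees
                 * it when xfs_trans_free() walks t_busy. */
                list_add(&busyp->list, &tp->t_busy);
        }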
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 73e2ad397432..c6e4f2c8de6e 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h | |||
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, | |||
35 | struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); | 35 | struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); |
36 | struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, | 36 | struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, |
37 | struct xfs_log_item_desc *); | 37 | struct xfs_log_item_desc *); |
38 | void xfs_trans_free_items(struct xfs_trans *, int); | 38 | |
39 | void xfs_trans_unlock_items(struct xfs_trans *, | 39 | void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn); |
40 | xfs_lsn_t); | 40 | void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, |
41 | void xfs_trans_free_busy(xfs_trans_t *tp); | 41 | int flags); |
42 | xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, | 42 | |
43 | xfs_agnumber_t ag, | 43 | void xfs_trans_item_committed(struct xfs_log_item *lip, |
44 | xfs_extlen_t idx); | 44 | xfs_lsn_t commit_lsn, int aborted); |
45 | void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); | ||
45 | 46 | ||
46 | /* | 47 | /* |
47 | * AIL traversal cursor. | 48 | * AIL traversal cursor. |
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index b09904555d07..320775295e32 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h | |||
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ | |||
75 | 75 | ||
76 | typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ | 76 | typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ |
77 | 77 | ||
78 | typedef __uint32_t xlog_tid_t; /* transaction ID type */ | ||
79 | |||
78 | /* | 80 | /* |
79 | * These types are 64 bits on disk but are either 32 or 64 bits in memory. | 81 | * These types are 64 bits on disk but are either 32 or 64 bits in memory. |
80 | * Disk based types: | 82 | * Disk based types: |