diff options
author | Hugh Dickins <hugh@veritas.com> | 2009-01-06 17:39:53 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-06 18:59:05 -0500 |
commit | 7992fde72ce06c73280a1939b7a1e903bc95ef85 (patch) | |
tree | 8e8ef30ec4e29b325f70c2d01d2a9def192b5c64 | |
parent | 6a6ba83175c029c7820765bae44692266b29e67a (diff) |
swapfile: swap allocation use discard
When scan_swap_map() finds a free cluster of swap pages to allocate,
discard the old contents of the cluster if the device supports discard.
But don't bother when swap is so fragmented that we allocate single pages.
Be careful about racing allocations made while we're scanning for a
cluster; and hold up allocations made while we're discarding.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Joern Engel <joern@logfs.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Donjun Shin <djshin90@gmail.com>
Cc: Tejun Heo <teheo@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/swap.h | 3 | ||||
-rw-r--r-- | mm/swapfile.c | 119 |
2 files changed, 121 insertions, 1 deletion
diff --git a/include/linux/swap.h b/include/linux/swap.h index 0b9210ea96c7..fe79f44c858e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -121,6 +121,7 @@ enum { | |||
121 | SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ | 121 | SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ |
122 | SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ | 122 | SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ |
123 | SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ | 123 | SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ |
124 | SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ | ||
124 | /* add others here before... */ | 125 | /* add others here before... */ |
125 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ | 126 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ |
126 | }; | 127 | }; |
@@ -144,6 +145,8 @@ struct swap_info_struct { | |||
144 | unsigned short *swap_map; | 145 | unsigned short *swap_map; |
145 | unsigned int lowest_bit; | 146 | unsigned int lowest_bit; |
146 | unsigned int highest_bit; | 147 | unsigned int highest_bit; |
148 | unsigned int lowest_alloc; /* while preparing discard cluster */ | ||
149 | unsigned int highest_alloc; /* while preparing discard cluster */ | ||
147 | unsigned int cluster_next; | 150 | unsigned int cluster_next; |
148 | unsigned int cluster_nr; | 151 | unsigned int cluster_nr; |
149 | unsigned int pages; | 152 | unsigned int pages; |
diff --git a/mm/swapfile.c b/mm/swapfile.c index fbeb4bb8eb50..ca75b9e7c09f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si) | |||
115 | return err; /* That will often be -EOPNOTSUPP */ | 115 | return err; /* That will often be -EOPNOTSUPP */ |
116 | } | 116 | } |
117 | 117 | ||
118 | /* | |
119 | * Swap allocation tells the device that a cluster of swap (nr_pages pages starting at start_page) can now be discarded, | |
120 | * to allow the swap device to optimize its wear-levelling. Walks si's extent list, issuing one discard per extent piece, and gives up on the first discard error. | |
121 | */ | |
122 | static void discard_swap_cluster(struct swap_info_struct *si, | |
123 | pgoff_t start_page, pgoff_t nr_pages) | |
124 | { | |
125 | struct swap_extent *se = si->curr_swap_extent; /* begin the search at the cached extent */ | |
126 | int found_extent = 0; | |
127 | |||
128 | while (nr_pages) { | |
129 | struct list_head *lh; | |
130 | |||
131 | if (se->start_page <= start_page && | |
132 | start_page < se->start_page + se->nr_pages) { | |
133 | pgoff_t offset = start_page - se->start_page; | |
134 | sector_t start_block = se->start_block + offset; | |
135 | pgoff_t nr_blocks = se->nr_pages - offset; | |
136 | |||
137 | if (nr_blocks > nr_pages) /* clamp to what remains of the request */ | |
138 | nr_blocks = nr_pages; | |
139 | start_page += nr_blocks; | |
140 | nr_pages -= nr_blocks; | |
141 | |||
142 | if (!found_extent++) /* cache only the first extent that matched */ | |
143 | si->curr_swap_extent = se; | |
144 | |||
145 | start_block <<= PAGE_SHIFT - 9; /* convert page units to 512-byte units */ | |
146 | nr_blocks <<= PAGE_SHIFT - 9; | |
147 | if (blkdev_issue_discard(si->bdev, start_block, | |
148 | nr_blocks, GFP_NOIO)) | |
149 | break; /* nonzero return: abandon the rest of the range */ | |
150 | } | |
151 | |||
152 | lh = se->list.next; | |
153 | if (lh == &si->extent_list) /* step over the list head so the walk wraps around */ | |
154 | lh = lh->next; | |
155 | se = list_entry(lh, struct swap_extent, list); | |
156 | } | |
157 | } | |
158 | |||
159 | static int wait_for_discard(void *word) /* action routine passed to wait_on_bit() for SWP_DISCARDING; word is unused */ | |
160 | { | |
161 | schedule(); | |
162 | return 0; | |
163 | } | |
164 | |||
118 | #define SWAPFILE_CLUSTER 256 | 165 | #define SWAPFILE_CLUSTER 256 |
119 | #define LATENCY_LIMIT 256 | 166 | #define LATENCY_LIMIT 256 |
120 | 167 | ||
121 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 168 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) |
122 | { | 169 | { |
123 | unsigned long offset; | 170 | unsigned long offset; |
124 | unsigned long last_in_cluster; | 171 | unsigned long last_in_cluster = 0; |
125 | int latency_ration = LATENCY_LIMIT; | 172 | int latency_ration = LATENCY_LIMIT; |
173 | int found_free_cluster = 0; | ||
126 | 174 | ||
127 | /* | 175 | /* |
128 | * We try to cluster swap pages by allocating them sequentially | 176 | * We try to cluster swap pages by allocating them sequentially |
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
142 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 190 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
143 | goto checks; | 191 | goto checks; |
144 | } | 192 | } |
193 | if (si->flags & SWP_DISCARDABLE) { | ||
194 | /* | ||
195 | * Start range check on racing allocations, in case | ||
196 | * they overlap the cluster we eventually decide on | ||
197 | * (we scan without swap_lock to allow preemption). | ||
198 | * It's hardly conceivable that cluster_nr could be | ||
199 | * wrapped during our scan, but don't depend on it. | ||
200 | */ | ||
201 | if (si->lowest_alloc) | ||
202 | goto checks; | ||
203 | si->lowest_alloc = si->max; | ||
204 | si->highest_alloc = 0; | ||
205 | } | ||
145 | spin_unlock(&swap_lock); | 206 | spin_unlock(&swap_lock); |
146 | 207 | ||
147 | offset = si->lowest_bit; | 208 | offset = si->lowest_bit; |
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
156 | offset -= SWAPFILE_CLUSTER - 1; | 217 | offset -= SWAPFILE_CLUSTER - 1; |
157 | si->cluster_next = offset; | 218 | si->cluster_next = offset; |
158 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 219 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
220 | found_free_cluster = 1; | ||
159 | goto checks; | 221 | goto checks; |
160 | } | 222 | } |
161 | if (unlikely(--latency_ration < 0)) { | 223 | if (unlikely(--latency_ration < 0)) { |
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
167 | offset = si->lowest_bit; | 229 | offset = si->lowest_bit; |
168 | spin_lock(&swap_lock); | 230 | spin_lock(&swap_lock); |
169 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 231 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
232 | si->lowest_alloc = 0; | ||
170 | } | 233 | } |
171 | 234 | ||
172 | checks: | 235 | checks: |
@@ -191,6 +254,60 @@ checks: | |||
191 | si->swap_map[offset] = 1; | 254 | si->swap_map[offset] = 1; |
192 | si->cluster_next = offset + 1; | 255 | si->cluster_next = offset + 1; |
193 | si->flags -= SWP_SCANNING; | 256 | si->flags -= SWP_SCANNING; |
257 | |||
258 | if (si->lowest_alloc) { | ||
259 | /* | ||
260 | * Only set when SWP_DISCARDABLE, and there's a scan | ||
261 | * for a free cluster in progress or just completed. | ||
262 | */ | ||
263 | if (found_free_cluster) { | ||
264 | /* | ||
265 | * To optimize wear-levelling, discard the | ||
266 | * old data of the cluster, taking care not to | ||
267 | * discard any of its pages that have already | ||
268 | * been allocated by racing tasks (offset has | ||
269 | * already stepped over any at the beginning). | ||
270 | */ | ||
271 | if (offset < si->highest_alloc && | ||
272 | si->lowest_alloc <= last_in_cluster) | ||
273 | last_in_cluster = si->lowest_alloc - 1; | ||
274 | si->flags |= SWP_DISCARDING; | ||
275 | spin_unlock(&swap_lock); | ||
276 | |||
277 | if (offset < last_in_cluster) | ||
278 | discard_swap_cluster(si, offset, | ||
279 | last_in_cluster - offset + 1); | ||
280 | |||
281 | spin_lock(&swap_lock); | ||
282 | si->lowest_alloc = 0; | ||
283 | si->flags &= ~SWP_DISCARDING; | ||
284 | |||
285 | smp_mb(); /* wake_up_bit advises this */ | ||
286 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | ||
287 | |||
288 | } else if (si->flags & SWP_DISCARDING) { | ||
289 | /* | ||
290 | * Delay using pages allocated by racing tasks | ||
291 | * until the whole discard has been issued. We | ||
292 | * could defer that delay until swap_writepage, | ||
293 | * but it's easier to keep this self-contained. | ||
294 | */ | ||
295 | spin_unlock(&swap_lock); | ||
296 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | ||
297 | wait_for_discard, TASK_UNINTERRUPTIBLE); | ||
298 | spin_lock(&swap_lock); | ||
299 | } else { | ||
300 | /* | ||
301 | * Note pages allocated by racing tasks while | ||
302 | * scan for a free cluster is in progress, so | ||
303 | * that its final discard can exclude them. | ||
304 | */ | ||
305 | if (offset < si->lowest_alloc) | ||
306 | si->lowest_alloc = offset; | ||
307 | if (offset > si->highest_alloc) | ||
308 | si->highest_alloc = offset; | ||
309 | } | ||
310 | } | ||
194 | return offset; | 311 | return offset; |
195 | 312 | ||
196 | scan: | 313 | scan: |