aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHugh Dickins <hugh@veritas.com>2009-01-06 17:39:53 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-01-06 18:59:05 -0500
commit7992fde72ce06c73280a1939b7a1e903bc95ef85 (patch)
tree8e8ef30ec4e29b325f70c2d01d2a9def192b5c64
parent6a6ba83175c029c7820765bae44692266b29e67a (diff)
swapfile: swap allocation use discard
When scan_swap_map() finds a free cluster of swap pages to allocate, discard the old contents of the cluster if the device supports discard. But don't bother when swap is so fragmented that we allocate single pages. Be careful about racing allocations made while we're scanning for a cluster; and hold up allocations made while we're discarding. Signed-off-by: Hugh Dickins <hugh@veritas.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: David Woodhouse <dwmw2@infradead.org> Cc: Jens Axboe <jens.axboe@oracle.com> Cc: Matthew Wilcox <matthew@wil.cx> Cc: Joern Engel <joern@logfs.org> Cc: James Bottomley <James.Bottomley@HansenPartnership.com> Cc: Donjun Shin <djshin90@gmail.com> Cc: Tejun Heo <teheo@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/swap.h3
-rw-r--r--mm/swapfile.c119
2 files changed, 121 insertions, 1 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0b9210ea96c7..fe79f44c858e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -121,6 +121,7 @@ enum {
121 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ 121 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
122 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ 122 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
123 SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ 123 SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
124 SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
124 /* add others here before... */ 125 /* add others here before... */
125 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ 126 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
126}; 127};
@@ -144,6 +145,8 @@ struct swap_info_struct {
144 unsigned short *swap_map; 145 unsigned short *swap_map;
145 unsigned int lowest_bit; 146 unsigned int lowest_bit;
146 unsigned int highest_bit; 147 unsigned int highest_bit;
148 unsigned int lowest_alloc; /* while preparing discard cluster */
149 unsigned int highest_alloc; /* while preparing discard cluster */
147 unsigned int cluster_next; 150 unsigned int cluster_next;
148 unsigned int cluster_nr; 151 unsigned int cluster_nr;
149 unsigned int pages; 152 unsigned int pages;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fbeb4bb8eb50..ca75b9e7c09f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si)
115 return err; /* That will often be -EOPNOTSUPP */ 115 return err; /* That will often be -EOPNOTSUPP */
116} 116}
117 117
118/*
119 * swap allocation tell device that a cluster of swap can now be discarded,
120 * to allow the swap device to optimize its wear-levelling.
121 */
122static void discard_swap_cluster(struct swap_info_struct *si,
123 pgoff_t start_page, pgoff_t nr_pages)
124{
125 struct swap_extent *se = si->curr_swap_extent;
126 int found_extent = 0;
127
128 while (nr_pages) {
129 struct list_head *lh;
130
131 if (se->start_page <= start_page &&
132 start_page < se->start_page + se->nr_pages) {
133 pgoff_t offset = start_page - se->start_page;
134 sector_t start_block = se->start_block + offset;
135 pgoff_t nr_blocks = se->nr_pages - offset;
136
137 if (nr_blocks > nr_pages)
138 nr_blocks = nr_pages;
139 start_page += nr_blocks;
140 nr_pages -= nr_blocks;
141
142 if (!found_extent++)
143 si->curr_swap_extent = se;
144
145 start_block <<= PAGE_SHIFT - 9;
146 nr_blocks <<= PAGE_SHIFT - 9;
147 if (blkdev_issue_discard(si->bdev, start_block,
148 nr_blocks, GFP_NOIO))
149 break;
150 }
151
152 lh = se->list.next;
153 if (lh == &si->extent_list)
154 lh = lh->next;
155 se = list_entry(lh, struct swap_extent, list);
156 }
157}
158
159static int wait_for_discard(void *word)
160{
161 schedule();
162 return 0;
163}
164
118#define SWAPFILE_CLUSTER 256 165#define SWAPFILE_CLUSTER 256
119#define LATENCY_LIMIT 256 166#define LATENCY_LIMIT 256
120 167
121static inline unsigned long scan_swap_map(struct swap_info_struct *si) 168static inline unsigned long scan_swap_map(struct swap_info_struct *si)
122{ 169{
123 unsigned long offset; 170 unsigned long offset;
124 unsigned long last_in_cluster; 171 unsigned long last_in_cluster = 0;
125 int latency_ration = LATENCY_LIMIT; 172 int latency_ration = LATENCY_LIMIT;
173 int found_free_cluster = 0;
126 174
127 /* 175 /*
128 * We try to cluster swap pages by allocating them sequentially 176 * We try to cluster swap pages by allocating them sequentially
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
142 si->cluster_nr = SWAPFILE_CLUSTER - 1; 190 si->cluster_nr = SWAPFILE_CLUSTER - 1;
143 goto checks; 191 goto checks;
144 } 192 }
193 if (si->flags & SWP_DISCARDABLE) {
194 /*
195 * Start range check on racing allocations, in case
196 * they overlap the cluster we eventually decide on
197 * (we scan without swap_lock to allow preemption).
198 * It's hardly conceivable that cluster_nr could be
199 * wrapped during our scan, but don't depend on it.
200 */
201 if (si->lowest_alloc)
202 goto checks;
203 si->lowest_alloc = si->max;
204 si->highest_alloc = 0;
205 }
145 spin_unlock(&swap_lock); 206 spin_unlock(&swap_lock);
146 207
147 offset = si->lowest_bit; 208 offset = si->lowest_bit;
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
156 offset -= SWAPFILE_CLUSTER - 1; 217 offset -= SWAPFILE_CLUSTER - 1;
157 si->cluster_next = offset; 218 si->cluster_next = offset;
158 si->cluster_nr = SWAPFILE_CLUSTER - 1; 219 si->cluster_nr = SWAPFILE_CLUSTER - 1;
220 found_free_cluster = 1;
159 goto checks; 221 goto checks;
160 } 222 }
161 if (unlikely(--latency_ration < 0)) { 223 if (unlikely(--latency_ration < 0)) {
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
167 offset = si->lowest_bit; 229 offset = si->lowest_bit;
168 spin_lock(&swap_lock); 230 spin_lock(&swap_lock);
169 si->cluster_nr = SWAPFILE_CLUSTER - 1; 231 si->cluster_nr = SWAPFILE_CLUSTER - 1;
232 si->lowest_alloc = 0;
170 } 233 }
171 234
172checks: 235checks:
@@ -191,6 +254,60 @@ checks:
191 si->swap_map[offset] = 1; 254 si->swap_map[offset] = 1;
192 si->cluster_next = offset + 1; 255 si->cluster_next = offset + 1;
193 si->flags -= SWP_SCANNING; 256 si->flags -= SWP_SCANNING;
257
258 if (si->lowest_alloc) {
259 /*
260 * Only set when SWP_DISCARDABLE, and there's a scan
261 * for a free cluster in progress or just completed.
262 */
263 if (found_free_cluster) {
264 /*
265 * To optimize wear-levelling, discard the
266 * old data of the cluster, taking care not to
267 * discard any of its pages that have already
268 * been allocated by racing tasks (offset has
269 * already stepped over any at the beginning).
270 */
271 if (offset < si->highest_alloc &&
272 si->lowest_alloc <= last_in_cluster)
273 last_in_cluster = si->lowest_alloc - 1;
274 si->flags |= SWP_DISCARDING;
275 spin_unlock(&swap_lock);
276
277 if (offset < last_in_cluster)
278 discard_swap_cluster(si, offset,
279 last_in_cluster - offset + 1);
280
281 spin_lock(&swap_lock);
282 si->lowest_alloc = 0;
283 si->flags &= ~SWP_DISCARDING;
284
285 smp_mb(); /* wake_up_bit advises this */
286 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
287
288 } else if (si->flags & SWP_DISCARDING) {
289 /*
290 * Delay using pages allocated by racing tasks
291 * until the whole discard has been issued. We
292 * could defer that delay until swap_writepage,
293 * but it's easier to keep this self-contained.
294 */
295 spin_unlock(&swap_lock);
296 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
297 wait_for_discard, TASK_UNINTERRUPTIBLE);
298 spin_lock(&swap_lock);
299 } else {
300 /*
301 * Note pages allocated by racing tasks while
302 * scan for a free cluster is in progress, so
303 * that its final discard can exclude them.
304 */
305 if (offset < si->lowest_alloc)
306 si->lowest_alloc = offset;
307 if (offset > si->highest_alloc)
308 si->highest_alloc = offset;
309 }
310 }
194 return offset; 311 return offset;
195 312
196scan: 313scan: