aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- include/linux/swap.h | 3
-rw-r--r-- mm/swapfile.c | 119
2 files changed, 121 insertions, 1 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0b9210ea96c7..fe79f44c858e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -121,6 +121,7 @@ enum {
121 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ 121 SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
122 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ 122 SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
123 SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ 123 SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
124 SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
124 /* add others here before... */ 125 /* add others here before... */
125 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ 126 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
126}; 127};
@@ -144,6 +145,8 @@ struct swap_info_struct {
144 unsigned short *swap_map; 145 unsigned short *swap_map;
145 unsigned int lowest_bit; 146 unsigned int lowest_bit;
146 unsigned int highest_bit; 147 unsigned int highest_bit;
148 unsigned int lowest_alloc; /* while preparing discard cluster */
149 unsigned int highest_alloc; /* while preparing discard cluster */
147 unsigned int cluster_next; 150 unsigned int cluster_next;
148 unsigned int cluster_nr; 151 unsigned int cluster_nr;
149 unsigned int pages; 152 unsigned int pages;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fbeb4bb8eb50..ca75b9e7c09f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si)
115 return err; /* That will often be -EOPNOTSUPP */ 115 return err; /* That will often be -EOPNOTSUPP */
116} 116}
117 117
118/*
119 * swap allocation tell device that a cluster of swap can now be discarded,
120 * to allow the swap device to optimize its wear-levelling.
121 */
122static void discard_swap_cluster(struct swap_info_struct *si,
123 pgoff_t start_page, pgoff_t nr_pages)
124{
125 struct swap_extent *se = si->curr_swap_extent;
126 int found_extent = 0;
127
128 while (nr_pages) {
129 struct list_head *lh;
130
131 if (se->start_page <= start_page &&
132 start_page < se->start_page + se->nr_pages) {
133 pgoff_t offset = start_page - se->start_page;
134 sector_t start_block = se->start_block + offset;
135 pgoff_t nr_blocks = se->nr_pages - offset;
136
137 if (nr_blocks > nr_pages)
138 nr_blocks = nr_pages;
139 start_page += nr_blocks;
140 nr_pages -= nr_blocks;
141
142 if (!found_extent++)
143 si->curr_swap_extent = se;
144
145 start_block <<= PAGE_SHIFT - 9;
146 nr_blocks <<= PAGE_SHIFT - 9;
147 if (blkdev_issue_discard(si->bdev, start_block,
148 nr_blocks, GFP_NOIO))
149 break;
150 }
151
152 lh = se->list.next;
153 if (lh == &si->extent_list)
154 lh = lh->next;
155 se = list_entry(lh, struct swap_extent, list);
156 }
157}
158
/*
 * Action callback passed to wait_on_bit() by scan_swap_map() while it
 * waits for SWP_DISCARDING to clear: give up the CPU, then return 0.
 *
 * @word: unused here.
 *
 * NOTE(review): returning 0 presumably lets wait_on_bit() re-test the
 * bit and keep waiting rather than abort -- confirm against the
 * wait_on_bit() contract.
 */
static int wait_for_discard(void *word)
{
	schedule();

	return 0;
}
164
118#define SWAPFILE_CLUSTER 256 165#define SWAPFILE_CLUSTER 256
119#define LATENCY_LIMIT 256 166#define LATENCY_LIMIT 256
120 167
121static inline unsigned long scan_swap_map(struct swap_info_struct *si) 168static inline unsigned long scan_swap_map(struct swap_info_struct *si)
122{ 169{
123 unsigned long offset; 170 unsigned long offset;
124 unsigned long last_in_cluster; 171 unsigned long last_in_cluster = 0;
125 int latency_ration = LATENCY_LIMIT; 172 int latency_ration = LATENCY_LIMIT;
173 int found_free_cluster = 0;
126 174
127 /* 175 /*
128 * We try to cluster swap pages by allocating them sequentially 176 * We try to cluster swap pages by allocating them sequentially
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
142 si->cluster_nr = SWAPFILE_CLUSTER - 1; 190 si->cluster_nr = SWAPFILE_CLUSTER - 1;
143 goto checks; 191 goto checks;
144 } 192 }
193 if (si->flags & SWP_DISCARDABLE) {
194 /*
195 * Start range check on racing allocations, in case
196 * they overlap the cluster we eventually decide on
197 * (we scan without swap_lock to allow preemption).
198 * It's hardly conceivable that cluster_nr could be
199 * wrapped during our scan, but don't depend on it.
200 */
201 if (si->lowest_alloc)
202 goto checks;
203 si->lowest_alloc = si->max;
204 si->highest_alloc = 0;
205 }
145 spin_unlock(&swap_lock); 206 spin_unlock(&swap_lock);
146 207
147 offset = si->lowest_bit; 208 offset = si->lowest_bit;
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
156 offset -= SWAPFILE_CLUSTER - 1; 217 offset -= SWAPFILE_CLUSTER - 1;
157 si->cluster_next = offset; 218 si->cluster_next = offset;
158 si->cluster_nr = SWAPFILE_CLUSTER - 1; 219 si->cluster_nr = SWAPFILE_CLUSTER - 1;
220 found_free_cluster = 1;
159 goto checks; 221 goto checks;
160 } 222 }
161 if (unlikely(--latency_ration < 0)) { 223 if (unlikely(--latency_ration < 0)) {
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
167 offset = si->lowest_bit; 229 offset = si->lowest_bit;
168 spin_lock(&swap_lock); 230 spin_lock(&swap_lock);
169 si->cluster_nr = SWAPFILE_CLUSTER - 1; 231 si->cluster_nr = SWAPFILE_CLUSTER - 1;
232 si->lowest_alloc = 0;
170 } 233 }
171 234
172checks: 235checks:
@@ -191,6 +254,60 @@ checks:
191 si->swap_map[offset] = 1; 254 si->swap_map[offset] = 1;
192 si->cluster_next = offset + 1; 255 si->cluster_next = offset + 1;
193 si->flags -= SWP_SCANNING; 256 si->flags -= SWP_SCANNING;
257
258 if (si->lowest_alloc) {
259 /*
260 * Only set when SWP_DISCARDABLE, and there's a scan
261 * for a free cluster in progress or just completed.
262 */
263 if (found_free_cluster) {
264 /*
265 * To optimize wear-levelling, discard the
266 * old data of the cluster, taking care not to
267 * discard any of its pages that have already
268 * been allocated by racing tasks (offset has
269 * already stepped over any at the beginning).
270 */
271 if (offset < si->highest_alloc &&
272 si->lowest_alloc <= last_in_cluster)
273 last_in_cluster = si->lowest_alloc - 1;
274 si->flags |= SWP_DISCARDING;
275 spin_unlock(&swap_lock);
276
277 if (offset < last_in_cluster)
278 discard_swap_cluster(si, offset,
279 last_in_cluster - offset + 1);
280
281 spin_lock(&swap_lock);
282 si->lowest_alloc = 0;
283 si->flags &= ~SWP_DISCARDING;
284
285 smp_mb(); /* wake_up_bit advises this */
286 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
287
288 } else if (si->flags & SWP_DISCARDING) {
289 /*
290 * Delay using pages allocated by racing tasks
291 * until the whole discard has been issued. We
292 * could defer that delay until swap_writepage,
293 * but it's easier to keep this self-contained.
294 */
295 spin_unlock(&swap_lock);
296 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
297 wait_for_discard, TASK_UNINTERRUPTIBLE);
298 spin_lock(&swap_lock);
299 } else {
300 /*
301 * Note pages allocated by racing tasks while
302 * scan for a free cluster is in progress, so
303 * that its final discard can exclude them.
304 */
305 if (offset < si->lowest_alloc)
306 si->lowest_alloc = offset;
307 if (offset > si->highest_alloc)
308 si->highest_alloc = offset;
309 }
310 }
194 return offset; 311 return offset;
195 312
196scan: 313scan: