diff options
-rw-r--r-- | include/linux/swap.h | 3 | ||||
-rw-r--r-- | mm/swapfile.c | 119 |
2 files changed, 121 insertions, 1 deletions
diff --git a/include/linux/swap.h b/include/linux/swap.h index 0b9210ea96c7..fe79f44c858e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -121,6 +121,7 @@ enum { | |||
121 | SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ | 121 | SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ |
122 | SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ | 122 | SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ |
123 | SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ | 123 | SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */ |
124 | SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ | ||
124 | /* add others here before... */ | 125 | /* add others here before... */ |
125 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ | 126 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ |
126 | }; | 127 | }; |
@@ -144,6 +145,8 @@ struct swap_info_struct { | |||
144 | unsigned short *swap_map; | 145 | unsigned short *swap_map; |
145 | unsigned int lowest_bit; | 146 | unsigned int lowest_bit; |
146 | unsigned int highest_bit; | 147 | unsigned int highest_bit; |
148 | unsigned int lowest_alloc; /* while preparing discard cluster */ | ||
149 | unsigned int highest_alloc; /* while preparing discard cluster */ | ||
147 | unsigned int cluster_next; | 150 | unsigned int cluster_next; |
148 | unsigned int cluster_nr; | 151 | unsigned int cluster_nr; |
149 | unsigned int pages; | 152 | unsigned int pages; |
diff --git a/mm/swapfile.c b/mm/swapfile.c index fbeb4bb8eb50..ca75b9e7c09f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -115,14 +115,62 @@ static int discard_swap(struct swap_info_struct *si) | |||
115 | return err; /* That will often be -EOPNOTSUPP */ | 115 | return err; /* That will often be -EOPNOTSUPP */ |
116 | } | 116 | } |
117 | 117 | ||
118 | /* | ||
119 | * During swap allocation, tell the device that a cluster of swap can now be discarded, | ||
120 | * to allow the swap device to optimize its wear-levelling. | ||
121 | */ | ||
122 | static void discard_swap_cluster(struct swap_info_struct *si, | ||
123 | pgoff_t start_page, pgoff_t nr_pages) | ||
124 | { | ||
125 | struct swap_extent *se = si->curr_swap_extent; | ||
126 | int found_extent = 0; | ||
127 | |||
128 | while (nr_pages) { | ||
129 | struct list_head *lh; | ||
130 | |||
131 | if (se->start_page <= start_page && | ||
132 | start_page < se->start_page + se->nr_pages) { | ||
133 | pgoff_t offset = start_page - se->start_page; | ||
134 | sector_t start_block = se->start_block + offset; | ||
135 | pgoff_t nr_blocks = se->nr_pages - offset; | ||
136 | |||
137 | if (nr_blocks > nr_pages) | ||
138 | nr_blocks = nr_pages; | ||
139 | start_page += nr_blocks; | ||
140 | nr_pages -= nr_blocks; | ||
141 | |||
142 | if (!found_extent++) | ||
143 | si->curr_swap_extent = se; | ||
144 | |||
145 | start_block <<= PAGE_SHIFT - 9; | ||
146 | nr_blocks <<= PAGE_SHIFT - 9; | ||
147 | if (blkdev_issue_discard(si->bdev, start_block, | ||
148 | nr_blocks, GFP_NOIO)) | ||
149 | break; | ||
150 | } | ||
151 | |||
152 | lh = se->list.next; | ||
153 | if (lh == &si->extent_list) | ||
154 | lh = lh->next; | ||
155 | se = list_entry(lh, struct swap_extent, list); | ||
156 | } | ||
157 | } | ||
158 | |||
159 | static int wait_for_discard(void *word) | ||
160 | { | ||
161 | schedule(); | ||
162 | return 0; | ||
163 | } | ||
164 | |||
118 | #define SWAPFILE_CLUSTER 256 | 165 | #define SWAPFILE_CLUSTER 256 |
119 | #define LATENCY_LIMIT 256 | 166 | #define LATENCY_LIMIT 256 |
120 | 167 | ||
121 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 168 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) |
122 | { | 169 | { |
123 | unsigned long offset; | 170 | unsigned long offset; |
124 | unsigned long last_in_cluster; | 171 | unsigned long last_in_cluster = 0; |
125 | int latency_ration = LATENCY_LIMIT; | 172 | int latency_ration = LATENCY_LIMIT; |
173 | int found_free_cluster = 0; | ||
126 | 174 | ||
127 | /* | 175 | /* |
128 | * We try to cluster swap pages by allocating them sequentially | 176 | * We try to cluster swap pages by allocating them sequentially |
@@ -142,6 +190,19 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
142 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 190 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
143 | goto checks; | 191 | goto checks; |
144 | } | 192 | } |
193 | if (si->flags & SWP_DISCARDABLE) { | ||
194 | /* | ||
195 | * Start range check on racing allocations, in case | ||
196 | * they overlap the cluster we eventually decide on | ||
197 | * (we scan without swap_lock to allow preemption). | ||
198 | * It's hardly conceivable that cluster_nr could be | ||
199 | * wrapped during our scan, but don't depend on it. | ||
200 | */ | ||
201 | if (si->lowest_alloc) | ||
202 | goto checks; | ||
203 | si->lowest_alloc = si->max; | ||
204 | si->highest_alloc = 0; | ||
205 | } | ||
145 | spin_unlock(&swap_lock); | 206 | spin_unlock(&swap_lock); |
146 | 207 | ||
147 | offset = si->lowest_bit; | 208 | offset = si->lowest_bit; |
@@ -156,6 +217,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
156 | offset -= SWAPFILE_CLUSTER - 1; | 217 | offset -= SWAPFILE_CLUSTER - 1; |
157 | si->cluster_next = offset; | 218 | si->cluster_next = offset; |
158 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 219 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
220 | found_free_cluster = 1; | ||
159 | goto checks; | 221 | goto checks; |
160 | } | 222 | } |
161 | if (unlikely(--latency_ration < 0)) { | 223 | if (unlikely(--latency_ration < 0)) { |
@@ -167,6 +229,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
167 | offset = si->lowest_bit; | 229 | offset = si->lowest_bit; |
168 | spin_lock(&swap_lock); | 230 | spin_lock(&swap_lock); |
169 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 231 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
232 | si->lowest_alloc = 0; | ||
170 | } | 233 | } |
171 | 234 | ||
172 | checks: | 235 | checks: |
@@ -191,6 +254,60 @@ checks: | |||
191 | si->swap_map[offset] = 1; | 254 | si->swap_map[offset] = 1; |
192 | si->cluster_next = offset + 1; | 255 | si->cluster_next = offset + 1; |
193 | si->flags -= SWP_SCANNING; | 256 | si->flags -= SWP_SCANNING; |
257 | |||
258 | if (si->lowest_alloc) { | ||
259 | /* | ||
260 | * Only set when SWP_DISCARDABLE, and there's a scan | ||
261 | * for a free cluster in progress or just completed. | ||
262 | */ | ||
263 | if (found_free_cluster) { | ||
264 | /* | ||
265 | * To optimize wear-levelling, discard the | ||
266 | * old data of the cluster, taking care not to | ||
267 | * discard any of its pages that have already | ||
268 | * been allocated by racing tasks (offset has | ||
269 | * already stepped over any at the beginning). | ||
270 | */ | ||
271 | if (offset < si->highest_alloc && | ||
272 | si->lowest_alloc <= last_in_cluster) | ||
273 | last_in_cluster = si->lowest_alloc - 1; | ||
274 | si->flags |= SWP_DISCARDING; | ||
275 | spin_unlock(&swap_lock); | ||
276 | |||
277 | if (offset < last_in_cluster) | ||
278 | discard_swap_cluster(si, offset, | ||
279 | last_in_cluster - offset + 1); | ||
280 | |||
281 | spin_lock(&swap_lock); | ||
282 | si->lowest_alloc = 0; | ||
283 | si->flags &= ~SWP_DISCARDING; | ||
284 | |||
285 | smp_mb(); /* wake_up_bit advises this */ | ||
286 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | ||
287 | |||
288 | } else if (si->flags & SWP_DISCARDING) { | ||
289 | /* | ||
290 | * Delay using pages allocated by racing tasks | ||
291 | * until the whole discard has been issued. We | ||
292 | * could defer that delay until swap_writepage, | ||
293 | * but it's easier to keep this self-contained. | ||
294 | */ | ||
295 | spin_unlock(&swap_lock); | ||
296 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | ||
297 | wait_for_discard, TASK_UNINTERRUPTIBLE); | ||
298 | spin_lock(&swap_lock); | ||
299 | } else { | ||
300 | /* | ||
301 | * Note pages allocated by racing tasks while | ||
302 | * scan for a free cluster is in progress, so | ||
303 | * that its final discard can exclude them. | ||
304 | */ | ||
305 | if (offset < si->lowest_alloc) | ||
306 | si->lowest_alloc = offset; | ||
307 | if (offset > si->highest_alloc) | ||
308 | si->highest_alloc = offset; | ||
309 | } | ||
310 | } | ||
194 | return offset; | 311 | return offset; |
195 | 312 | ||
196 | scan: | 313 | scan: |