author	Aaron Lu <ziqian.lzq@antfin.com>	2019-07-11 23:55:41 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-07-12 14:05:43 -0400
commit	4efaceb1c5f8136d5fec3f26549d294b8e898bd7 (patch)
tree	9bb4b1ee853f4b25dfc021ac2c9248a118b73fc5 /mm/swapfile.c
parent	054f1d1faaed6a7930b77286d607ae45c01d0443 (diff)
mm, swap: use rbtree for swap_extent
swap_extent is used to map a swap page offset to the backing device's block
offset.  For a contiguous block range, one swap_extent is used, and all
these swap_extents are managed in a linked list.

These swap_extents are used by map_swap_entry() during swap's read and
write path.  To find out the backing device's block offset for a page
offset, the swap_extent list is traversed linearly, with curr_swap_extent
being used as a cache to speed up the search.

This works well as long as the number of swap_extents is small, or when
only a few processes access the swap device; but when the swap device has
many extents and a number of processes access it concurrently, it becomes
a problem.  On one of our servers, the disk's remaining size is tight:

$df -h
Filesystem      Size  Used Avail Use% Mounted on
... ...
/dev/nvme0n1p1  1.8T  1.3T  504G  72% /home/t4

When creating an 80G swapfile there, there are as many as 84656 swap
extents.  The end result is that the kernel spends about 30% of its time
in map_swap_entry() and swap throughput is only 70MB/s.  As a comparison,
when I used a smaller swapfile, e.g. 4G, whose swap_extent count dropped
to 2000, swap throughput is back to 400-500MB/s and map_swap_entry() drops
to about 3%.

Fix this by keeping the swap_extents in an rbtree instead of a list, so
that the lookup becomes a logarithmic tree search rather than a linear
walk.

One downside of using an rbtree for swap_extent is that 'struct rb_node'
takes 24 bytes while 'struct list_head' takes 16 bytes, i.e. 8 bytes more
per swap_extent.  For a swapfile that has 80k swap_extents, that means
625KiB more memory consumed.

Test:

Since it's not possible to reboot that server, I cannot test this patch
directly there.  Instead, I tested it on another server with an NVMe disk.
I created a 20G swapfile on an NVMe-backed XFS filesystem.  By default the
filesystem is quite clean and the created swapfile has only 2 extents.
Testing vanilla and this patch shows no obvious performance difference
when the swapfile is not fragmented.

To see the patch's effects, I used some tweaks to manually fragment the
swapfile by breaking extents at the 1M boundary.  This made the swapfile
have 20K extents.

nr_task=4
kernel   swapout(KB/s)   map_swap_entry(perf)   swapin(KB/s)   map_swap_entry(perf)
vanilla  165191          90.77%                 171798         90.21%
patched  858993 +420%     2.16%                 715827 +317%    0.77%

nr_task=8
kernel   swapout(KB/s)   map_swap_entry(perf)   swapin(KB/s)   map_swap_entry(perf)
vanilla  306783          92.19%                 318145         87.76%
patched  954437 +211%     2.35%                1073741 +237%    1.57%

swapout: swap-out throughput, in KB/s, higher is better
1st map_swap_entry: percentage of CPU cycles sampled by perf in map_swap_entry() during swap-out
swapin: swap-in throughput, in KB/s, higher is better
2nd map_swap_entry: percentage of CPU cycles sampled by perf in map_swap_entry() during swap-in

nr_task=1 doesn't show any difference; this is because curr_swap_extent
can effectively cache the correct swap extent for a single-task workload.

[akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/]
Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu
Signed-off-by: Aaron Lu <ziqian.lzq@antfin.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
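For illustration, the core of the change is the page-offset lookup: instead of
walking a list, offset_to_swap_extent() (in the diff below) searches the extent
tree for the extent whose [start_page, start_page + nr_pages) range contains the
offset, and the device block is then start_block + (offset - start_page).  Below
is a minimal userspace sketch of that lookup; it is a simplified stand-in (a
sorted array plus binary search instead of the kernel rbtree, with hypothetical
names throughout), not kernel code.

/* sketch: map a swap page offset to a device block via sorted extents */
#include <stdio.h>

struct extent {
	unsigned long start_page;	/* first swap page offset covered */
	unsigned long nr_pages;		/* number of contiguous pages */
	unsigned long long start_block;	/* device block backing start_page */
};

static const struct extent *lookup(const struct extent *ext, int n,
				   unsigned long offset)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;
		const struct extent *e = &ext[mid];

		if (offset < e->start_page)
			hi = mid - 1;
		else if (offset >= e->start_page + e->nr_pages)
			lo = mid + 1;
		else
			return e;	/* offset falls inside this extent */
	}
	return NULL;			/* offset not mapped */
}

int main(void)
{
	/* three non-overlapping extents, sorted by start_page */
	static const struct extent map[] = {
		{ .start_page = 0,   .nr_pages = 100, .start_block = 1000 },
		{ .start_page = 100, .nr_pages = 50,  .start_block = 9000 },
		{ .start_page = 150, .nr_pages = 200, .start_block = 4000 },
	};
	unsigned long offset = 120;
	const struct extent *e = lookup(map, 3, offset);

	if (e)
		printf("page %lu -> block %llu\n", offset,
		       e->start_block + (offset - e->start_page));
	return 0;
}

Compiled and run, this prints "page 120 -> block 9020"; the patched
map_swap_entry() computes its result with the same
start_block + (offset - start_page) expression.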
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--	mm/swapfile.c	137
1 file changed, 75 insertions(+), 62 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbab16ddefa6..0789a762ce2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 	return ret;
 }
 
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+	struct rb_node *rb = rb_first(&sis->swap_extent_root);
+	return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+	struct rb_node *rb = rb_next(&se->rb_node);
+	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
 /*
  * swapon tell device that all the old swap contents can be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
 	int err = 0;
 
 	/* Do not discard the swap header page! */
-	se = &si->first_swap_extent;
+	se = first_se(si);
 	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
@@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
 		cond_resched();
 	}
 
-	list_for_each_entry(se, &si->first_swap_extent.list, list) {
+	for (se = next_se(se); se; se = next_se(se)) {
 		start_block = se->start_block << (PAGE_SHIFT - 9);
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
@@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
 	return err;		/* That will often be -EOPNOTSUPP */
 }
 
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+	struct swap_extent *se;
+	struct rb_node *rb;
+
+	rb = sis->swap_extent_root.rb_node;
+	while (rb) {
+		se = rb_entry(rb, struct swap_extent, rb_node);
+		if (offset < se->start_page)
+			rb = rb->rb_left;
+		else if (offset >= se->start_page + se->nr_pages)
+			rb = rb->rb_right;
+		else
+			return se;
+	}
+	/* It *must* be present */
+	BUG();
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si)
 static void discard_swap_cluster(struct swap_info_struct *si,
 				 pgoff_t start_page, pgoff_t nr_pages)
 {
-	struct swap_extent *se = si->curr_swap_extent;
-	int found_extent = 0;
+	struct swap_extent *se = offset_to_swap_extent(si, start_page);
 
 	while (nr_pages) {
-		if (se->start_page <= start_page &&
-		    start_page < se->start_page + se->nr_pages) {
-			pgoff_t offset = start_page - se->start_page;
-			sector_t start_block = se->start_block + offset;
-			sector_t nr_blocks = se->nr_pages - offset;
-
-			if (nr_blocks > nr_pages)
-				nr_blocks = nr_pages;
-			start_page += nr_blocks;
-			nr_pages -= nr_blocks;
-
-			if (!found_extent++)
-				si->curr_swap_extent = se;
-
-			start_block <<= PAGE_SHIFT - 9;
-			nr_blocks <<= PAGE_SHIFT - 9;
-			if (blkdev_issue_discard(si->bdev, start_block,
-				    nr_blocks, GFP_NOIO, 0))
-				break;
-		}
+		pgoff_t offset = start_page - se->start_page;
+		sector_t start_block = se->start_block + offset;
+		sector_t nr_blocks = se->nr_pages - offset;
+
+		if (nr_blocks > nr_pages)
+			nr_blocks = nr_pages;
+		start_page += nr_blocks;
+		nr_pages -= nr_blocks;
+
+		start_block <<= PAGE_SHIFT - 9;
+		nr_blocks <<= PAGE_SHIFT - 9;
+		if (blkdev_issue_discard(si->bdev, start_block,
+				nr_blocks, GFP_NOIO, 0))
+			break;
 
-		se = list_next_entry(se, list);
+		se = next_se(se);
 	}
 }
 
@@ -1755,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 			return type;
 		}
 		if (bdev == sis->bdev) {
-			struct swap_extent *se = &sis->first_swap_extent;
+			struct swap_extent *se = first_se(sis);
 
 			if (se->start_block == offset) {
 				if (bdev_p)
@@ -2232,7 +2257,6 @@ static void drain_mmlist(void)
 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
 	struct swap_info_struct *sis;
-	struct swap_extent *start_se;
 	struct swap_extent *se;
 	pgoff_t offset;
 
@@ -2240,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 	*bdev = sis->bdev;
 
 	offset = swp_offset(entry);
-	start_se = sis->curr_swap_extent;
-	se = start_se;
-
-	for ( ; ; ) {
-		if (se->start_page <= offset &&
-			offset < (se->start_page + se->nr_pages)) {
-			return se->start_block + (offset - se->start_page);
-		}
-		se = list_next_entry(se, list);
-		sis->curr_swap_extent = se;
-		BUG_ON(se == start_se);		/* It *must* be present */
-	}
+	se = offset_to_swap_extent(sis, offset);
+	return se->start_block + (offset - se->start_page);
 }
 
 /*
@@ -2269,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
  */
 static void destroy_swap_extents(struct swap_info_struct *sis)
 {
-	while (!list_empty(&sis->first_swap_extent.list)) {
-		struct swap_extent *se;
+	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+		struct rb_node *rb = sis->swap_extent_root.rb_node;
+		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
 
-		se = list_first_entry(&sis->first_swap_extent.list,
-				struct swap_extent, list);
-		list_del(&se->list);
+		rb_erase(rb, &sis->swap_extent_root);
 		kfree(se);
 	}
 
@@ -2290,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
 
 /*
  * Add a block range (and the corresponding page range) into this swapdev's
- * extent list.  The extent list is kept sorted in page order.
+ * extent tree.
  *
  * This function rather assumes that it is called in ascending page order.
  */
@@ -2298,20 +2311,21 @@ int
 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		unsigned long nr_pages, sector_t start_block)
 {
+	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
 	struct swap_extent *se;
 	struct swap_extent *new_se;
-	struct list_head *lh;
-
-	if (start_page == 0) {
-		se = &sis->first_swap_extent;
-		sis->curr_swap_extent = se;
-		se->start_page = 0;
-		se->nr_pages = nr_pages;
-		se->start_block = start_block;
-		return 1;
-	} else {
-		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
-		se = list_entry(lh, struct swap_extent, list);
+
+	/*
+	 * place the new node at the right most since the
+	 * function is called in ascending page order.
+	 */
+	while (*link) {
+		parent = *link;
+		link = &parent->rb_right;
+	}
+
+	if (parent) {
+		se = rb_entry(parent, struct swap_extent, rb_node);
 		BUG_ON(se->start_page + se->nr_pages != start_page);
 		if (se->start_block + se->nr_pages == start_block) {
 			/* Merge it */
@@ -2320,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		}
 	}
 
-	/*
-	 * No merge.  Insert a new extent, preserving ordering.
-	 */
+	/* No merge, insert a new extent. */
 	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
 	if (new_se == NULL)
 		return -ENOMEM;
@@ -2330,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 	new_se->nr_pages = nr_pages;
 	new_se->start_block = start_block;
 
-	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
+	rb_link_node(&new_se->rb_node, parent, link);
+	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
 	return 1;
 }
 EXPORT_SYMBOL_GPL(add_swap_extent);
@@ -2846,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 		 * would be relying on p->type to remain valid.
 		 */
 	}
-	INIT_LIST_HEAD(&p->first_swap_extent.list);
+	p->swap_extent_root = RB_ROOT;
 	plist_node_init(&p->list, 0);
 	for_each_node(i)
 		plist_node_init(&p->avail_lists[i], 0);