author		Aaron Lu <ziqian.lzq@antfin.com>	2019-07-11 23:55:41 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-07-12 14:05:43 -0400
commit		4efaceb1c5f8136d5fec3f26549d294b8e898bd7 (patch)
tree		9bb4b1ee853f4b25dfc021ac2c9248a118b73fc5 /mm/swapfile.c
parent		054f1d1faaed6a7930b77286d607ae45c01d0443 (diff)
mm, swap: use rbtree for swap_extent
swap_extent is used to map a swap page offset to the backing device's
block offset. One swap_extent covers a contiguous range of blocks, and
all of a device's swap_extents are managed in a linked list.
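For reference, this is the shape of the structure in question, a sketch
based on the fields used in the diff below (before this patch the
extents are linked through a 'struct list_head list' instead of the
rb_node):

	struct swap_extent {
		struct rb_node rb_node;	/* was: struct list_head list */
		pgoff_t start_page;	/* first swap page offset covered */
		pgoff_t nr_pages;	/* number of pages in this extent */
		sector_t start_block;	/* first block on the backing device */
	};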
These swap_extents are used by map_swap_entry() during swap's read and
write paths. To find the backing device's block offset for a page
offset, the swap_extent list is traversed linearly, with
curr_swap_extent used as a cache to speed up the search.
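Condensed, that pre-patch lookup is the following loop (the full
version is visible in the map_swap_entry() hunk below):

	se = sis->curr_swap_extent;
	for ( ; ; ) {
		if (se->start_page <= offset &&
		    offset < se->start_page + se->nr_pages)
			return se->start_block + (offset - se->start_page);
		/* linear walk; the shared cursor is updated on every miss */
		se = list_next_entry(se, list);
		sis->curr_swap_extent = se;
	}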
This works well as long as there are not many swap_extents or only a
few processes access the swap device, but when the swap device has many
extents and a number of processes access it concurrently, it becomes a
problem. On one of our servers, the disk's remaining space is tight:
$df -h
Filesystem Size Used Avail Use% Mounted on
... ...
/dev/nvme0n1p1 1.8T 1.3T 504G 72% /home/t4
Creating an 80G swapfile there results in as many as 84656 swap
extents. The end result is that the kernel spends about 30% of its time
in map_swap_entry() and swap throughput is only 70MB/s.
For comparison, when I used a smaller swapfile, e.g. 4G, the number of
swap extents dropped to about 2000, swap throughput was back to
400-500MB/s and map_swap_entry() accounted for only about 3%.
One downside of using an rbtree for swap_extent is that 'struct
rb_node' takes 24 bytes while 'struct list_head' takes 16 bytes, i.e. 8
bytes more for each swap_extent. For a swapfile that has 80k
swap_extents, that means 625KiB more memory consumed.
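On a 64-bit kernel, the arithmetic works out as:

	sizeof(struct rb_node)   = 3 pointers = 24 bytes
	sizeof(struct list_head) = 2 pointers = 16 bytes
	80,000 extents * 8 bytes = 640,000 bytes ~= 625 KiB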
Test:
Since it's not possible to reboot that server, I could not test this
patch directly there. Instead, I tested it on another server with an
NVMe disk.
I created a 20G swapfile on an NVMe-backed XFS filesystem. By default,
the filesystem is quite clean and the created swapfile has only 2
extents. Testing vanilla and patched kernels shows no obvious
performance difference when the swapfile is not fragmented.
To see the patch's effect, I used some tweaks to manually fragment the
swapfile, breaking its extents at 1M boundaries. This left the swapfile
with 20K extents.
nr_task=4
kernel    swapout(KB/s)    map_swap_entry(perf)    swapin(KB/s)     map_swap_entry(perf)
vanilla   165191           90.77%                  171798           90.21%
patched   858993 +420%     2.16%                   715827 +317%     0.77%

nr_task=8
kernel    swapout(KB/s)    map_swap_entry(perf)    swapin(KB/s)     map_swap_entry(perf)
vanilla   306783           92.19%                  318145           87.76%
patched   954437 +211%     2.35%                   1073741 +237%    1.57%
swapout: the throughput of swap out, in KB/s, higher is better
1st map_swap_entry: cpu cycles percent sampled by perf
swapin: the throughput of swap in, in KB/s, higher is better
2nd map_swap_entry: cpu cycles percent sampled by perf
nr_task=1 doesn't show any difference; this is because curr_swap_extent
can effectively cache the correct swap extent for a single-task
workload.
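With the patch, each lookup is instead a binary search down the tree,
O(log n) in the number of extents and with no shared cursor for
concurrent tasks to bounce between CPUs (condensed from
offset_to_swap_extent() in the diff below):

	rb = sis->swap_extent_root.rb_node;
	while (rb) {
		se = rb_entry(rb, struct swap_extent, rb_node);
		if (offset < se->start_page)
			rb = rb->rb_left;	/* target is in the left subtree */
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;	/* target is in the right subtree */
		else
			return se;		/* offset falls inside this extent */
	}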
[akpm@linux-foundation.org: s/BUG_ON(1)/BUG()/]
Link: http://lkml.kernel.org/r/20190523142404.GA181@aaronlu
Signed-off-by: Aaron Lu <ziqian.lzq@antfin.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--	mm/swapfile.c	137
1 file changed, 75 insertions(+), 62 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbab16ddefa6..0789a762ce2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -152,6 +152,18 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 	return ret;
 }
 
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+	struct rb_node *rb = rb_first(&sis->swap_extent_root);
+	return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+	struct rb_node *rb = rb_next(&se->rb_node);
+	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
 /*
  * swapon tell device that all the old swap contents can be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -164,7 +176,7 @@ static int discard_swap(struct swap_info_struct *si)
 	int err = 0;
 
 	/* Do not discard the swap header page! */
-	se = &si->first_swap_extent;
+	se = first_se(si);
 	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
 	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
 	if (nr_blocks) {
@@ -175,7 +187,7 @@ static int discard_swap(struct swap_info_struct *si)
 		cond_resched();
 	}
 
-	list_for_each_entry(se, &si->first_swap_extent.list, list) {
+	for (se = next_se(se); se; se = next_se(se)) {
 		start_block = se->start_block << (PAGE_SHIFT - 9);
 		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
 
@@ -189,6 +201,26 @@ static int discard_swap(struct swap_info_struct *si)
 	return err;		/* That will often be -EOPNOTSUPP */
 }
 
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+	struct swap_extent *se;
+	struct rb_node *rb;
+
+	rb = sis->swap_extent_root.rb_node;
+	while (rb) {
+		se = rb_entry(rb, struct swap_extent, rb_node);
+		if (offset < se->start_page)
+			rb = rb->rb_left;
+		else if (offset >= se->start_page + se->nr_pages)
+			rb = rb->rb_right;
+		else
+			return se;
+	}
+	/* It *must* be present */
+	BUG();
+}
+
 /*
  * swap allocation tell device that a cluster of swap can now be discarded,
  * to allow the swap device to optimize its wear-levelling.
@@ -196,32 +228,25 @@ static int discard_swap(struct swap_info_struct *si)
 static void discard_swap_cluster(struct swap_info_struct *si,
 				 pgoff_t start_page, pgoff_t nr_pages)
 {
-	struct swap_extent *se = si->curr_swap_extent;
-	int found_extent = 0;
+	struct swap_extent *se = offset_to_swap_extent(si, start_page);
 
 	while (nr_pages) {
-		if (se->start_page <= start_page &&
-		    start_page < se->start_page + se->nr_pages) {
-			pgoff_t offset = start_page - se->start_page;
-			sector_t start_block = se->start_block + offset;
-			sector_t nr_blocks = se->nr_pages - offset;
-
-			if (nr_blocks > nr_pages)
-				nr_blocks = nr_pages;
-			start_page += nr_blocks;
-			nr_pages -= nr_blocks;
-
-			if (!found_extent++)
-				si->curr_swap_extent = se;
-
-			start_block <<= PAGE_SHIFT - 9;
-			nr_blocks <<= PAGE_SHIFT - 9;
-			if (blkdev_issue_discard(si->bdev, start_block,
-					nr_blocks, GFP_NOIO, 0))
-				break;
-		}
+		pgoff_t offset = start_page - se->start_page;
+		sector_t start_block = se->start_block + offset;
+		sector_t nr_blocks = se->nr_pages - offset;
+
+		if (nr_blocks > nr_pages)
+			nr_blocks = nr_pages;
+		start_page += nr_blocks;
+		nr_pages -= nr_blocks;
+
+		start_block <<= PAGE_SHIFT - 9;
+		nr_blocks <<= PAGE_SHIFT - 9;
+		if (blkdev_issue_discard(si->bdev, start_block,
+				nr_blocks, GFP_NOIO, 0))
+			break;
 
-		se = list_next_entry(se, list);
+		se = next_se(se);
 	}
 }
 
@@ -1755,7 +1780,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 			return type;
 		}
 		if (bdev == sis->bdev) {
-			struct swap_extent *se = &sis->first_swap_extent;
+			struct swap_extent *se = first_se(sis);
 
 			if (se->start_block == offset) {
 				if (bdev_p)
@@ -2232,7 +2257,6 @@ static void drain_mmlist(void)
 static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
 	struct swap_info_struct *sis;
-	struct swap_extent *start_se;
 	struct swap_extent *se;
 	pgoff_t offset;
 
@@ -2240,18 +2264,8 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 	*bdev = sis->bdev;
 
 	offset = swp_offset(entry);
-	start_se = sis->curr_swap_extent;
-	se = start_se;
-
-	for ( ; ; ) {
-		if (se->start_page <= offset &&
-			offset < (se->start_page + se->nr_pages)) {
-			return se->start_block + (offset - se->start_page);
-		}
-		se = list_next_entry(se, list);
-		sis->curr_swap_extent = se;
-		BUG_ON(se == start_se);	/* It *must* be present */
-	}
+	se = offset_to_swap_extent(sis, offset);
+	return se->start_block + (offset - se->start_page);
 }
 
 /*
@@ -2269,12 +2283,11 @@ sector_t map_swap_page(struct page *page, struct block_device **bdev)
  */
 static void destroy_swap_extents(struct swap_info_struct *sis)
 {
-	while (!list_empty(&sis->first_swap_extent.list)) {
-		struct swap_extent *se;
+	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+		struct rb_node *rb = sis->swap_extent_root.rb_node;
+		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
 
-		se = list_first_entry(&sis->first_swap_extent.list,
-				struct swap_extent, list);
-		list_del(&se->list);
+		rb_erase(rb, &sis->swap_extent_root);
 		kfree(se);
 	}
 
@@ -2290,7 +2303,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
 
 /*
  * Add a block range (and the corresponding page range) into this swapdev's
- * extent list.  The extent list is kept sorted in page order.
+ * extent tree.
  *
  * This function rather assumes that it is called in ascending page order.
  */
@@ -2298,20 +2311,21 @@ int
 add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		unsigned long nr_pages, sector_t start_block)
 {
+	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
 	struct swap_extent *se;
 	struct swap_extent *new_se;
-	struct list_head *lh;
 
-	if (start_page == 0) {
-		se = &sis->first_swap_extent;
-		sis->curr_swap_extent = se;
-		se->start_page = 0;
-		se->nr_pages = nr_pages;
-		se->start_block = start_block;
-		return 1;
-	} else {
-		lh = sis->first_swap_extent.list.prev;	/* Highest extent */
-		se = list_entry(lh, struct swap_extent, list);
+	/*
+	 * place the new node at the right most since the
+	 * function is called in ascending page order.
+	 */
+	while (*link) {
+		parent = *link;
+		link = &parent->rb_right;
+	}
+
+	if (parent) {
+		se = rb_entry(parent, struct swap_extent, rb_node);
 		BUG_ON(se->start_page + se->nr_pages != start_page);
 		if (se->start_block + se->nr_pages == start_block) {
 			/* Merge it */
@@ -2320,9 +2334,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 		}
 	}
 
-	/*
-	 * No merge.  Insert a new extent, preserving ordering.
-	 */
+	/* No merge, insert a new extent. */
 	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
 	if (new_se == NULL)
 		return -ENOMEM;
@@ -2330,7 +2342,8 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
 	new_se->nr_pages = nr_pages;
 	new_se->start_block = start_block;
 
-	list_add_tail(&new_se->list, &sis->first_swap_extent.list);
+	rb_link_node(&new_se->rb_node, parent, link);
+	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
 	return 1;
 }
 EXPORT_SYMBOL_GPL(add_swap_extent);
@@ -2846,7 +2859,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 		 * would be relying on p->type to remain valid.
 		 */
 	}
-	INIT_LIST_HEAD(&p->first_swap_extent.list);
+	p->swap_extent_root = RB_ROOT;
 	plist_node_init(&p->list, 0);
 	for_each_node(i)
 		plist_node_init(&p->avail_lists[i], 0);