aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorMiao Xie <miaox@cn.fujitsu.com>2011-01-05 05:07:28 -0500
committerChris Mason <chris.mason@oracle.com>2011-01-16 11:30:19 -0500
commitb2117a39fa96cf4814e7cab8c11494149ba6f29d (patch)
tree9327d1332d68f91931767ee7bc6233251ab41565 /fs
parent7bfc837df935d850fe996dfe92ef48975cd4170a (diff)
btrfs: make the chunk allocator utilize the devices better
With this patch, we change the handling method when we can not get enough free extents with default size. Implementation: 1. Look up the suitable free extent on each device and keep the search result. If not find a suitable free extent, keep the max free extent 2. If we get enough suitable free extents with default size, chunk allocation succeeds. 3. If we can not get enough free extents, but the number of the extent with default size is >= min_stripes, we just change the mapping information (reduce the number of stripes in the extent map), and chunk allocation succeeds. 4. If the number of the extent with default size is < min_stripes, sort the devices by its max free extent's size descending 5. Use the size of the max free extent on the (num_stripes - 1)th device as the stripe size to allocate the device space By this way, the chunk allocator can allocate chunks as large as possible when the devices' space is not enough and make full use of the devices. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/volumes.c379
-rw-r--r--fs/btrfs/volumes.h24
2 files changed, 300 insertions, 103 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4838bd395e49..c22784b989b7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -877,7 +877,7 @@ out:
877 btrfs_free_path(path); 877 btrfs_free_path(path);
878error: 878error:
879 *start = max_hole_start; 879 *start = max_hole_start;
880 if (len && max_hole_size > *len) 880 if (len)
881 *len = max_hole_size; 881 *len = max_hole_size;
882 return ret; 882 return ret;
883} 883}
@@ -2176,70 +2176,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2176 return calc_size * num_stripes; 2176 return calc_size * num_stripes;
2177} 2177}
2178 2178
2179static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2179/* Used to sort the devices by max_avail(descending sort) */
2180 struct btrfs_root *extent_root, 2180int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2181 struct map_lookup **map_ret,
2182 u64 *num_bytes, u64 *stripe_size,
2183 u64 start, u64 type)
2184{ 2181{
2185 struct btrfs_fs_info *info = extent_root->fs_info; 2182 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2186 struct btrfs_device *device = NULL; 2183 ((struct btrfs_device_info *)dev_info2)->max_avail)
2187 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2184 return -1;
2188 struct list_head *cur; 2185 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2189 struct map_lookup *map = NULL; 2186 ((struct btrfs_device_info *)dev_info2)->max_avail)
2190 struct extent_map_tree *em_tree; 2187 return 1;
2191 struct extent_map *em; 2188 else
2192 struct list_head private_devs; 2189 return 0;
2193 int min_stripe_size = 1 * 1024 * 1024; 2190}
2194 u64 calc_size = 1024 * 1024 * 1024;
2195 u64 max_chunk_size = calc_size;
2196 u64 min_free;
2197 u64 avail;
2198 u64 max_avail = 0;
2199 u64 dev_offset;
2200 int num_stripes = 1;
2201 int min_stripes = 1;
2202 int sub_stripes = 0;
2203 int ncopies = 1;
2204 int looped = 0;
2205 int ret;
2206 int index;
2207 int stripe_len = 64 * 1024;
2208 2191
2209 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2192static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2210 (type & BTRFS_BLOCK_GROUP_DUP)) { 2193 int *num_stripes, int *min_stripes,
2211 WARN_ON(1); 2194 int *sub_stripes)
2212 type &= ~BTRFS_BLOCK_GROUP_DUP; 2195{
2213 } 2196 *num_stripes = 1;
2214 if (list_empty(&fs_devices->alloc_list)) 2197 *min_stripes = 1;
2215 return -ENOSPC; 2198 *sub_stripes = 0;
2216 2199
2217 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2200 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2218 num_stripes = fs_devices->rw_devices; 2201 *num_stripes = fs_devices->rw_devices;
2219 min_stripes = 2; 2202 *min_stripes = 2;
2220 } 2203 }
2221 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2204 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2222 num_stripes = 2; 2205 *num_stripes = 2;
2223 min_stripes = 2; 2206 *min_stripes = 2;
2224 ncopies = 2;
2225 } 2207 }
2226 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2208 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2227 if (fs_devices->rw_devices < 2) 2209 if (fs_devices->rw_devices < 2)
2228 return -ENOSPC; 2210 return -ENOSPC;
2229 num_stripes = 2; 2211 *num_stripes = 2;
2230 min_stripes = 2; 2212 *min_stripes = 2;
2231 ncopies = 2;
2232 } 2213 }
2233 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2214 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2234 num_stripes = fs_devices->rw_devices; 2215 *num_stripes = fs_devices->rw_devices;
2235 if (num_stripes < 4) 2216 if (*num_stripes < 4)
2236 return -ENOSPC; 2217 return -ENOSPC;
2237 num_stripes &= ~(u32)1; 2218 *num_stripes &= ~(u32)1;
2238 sub_stripes = 2; 2219 *sub_stripes = 2;
2239 ncopies = 2; 2220 *min_stripes = 4;
2240 min_stripes = 4;
2241 } 2221 }
2242 2222
2223 return 0;
2224}
2225
2226static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2227 u64 proposed_size, u64 type,
2228 int num_stripes, int small_stripe)
2229{
2230 int min_stripe_size = 1 * 1024 * 1024;
2231 u64 calc_size = proposed_size;
2232 u64 max_chunk_size = calc_size;
2233 int ncopies = 1;
2234
2235 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2236 BTRFS_BLOCK_GROUP_DUP |
2237 BTRFS_BLOCK_GROUP_RAID10))
2238 ncopies = 2;
2239
2243 if (type & BTRFS_BLOCK_GROUP_DATA) { 2240 if (type & BTRFS_BLOCK_GROUP_DATA) {
2244 max_chunk_size = 10 * calc_size; 2241 max_chunk_size = 10 * calc_size;
2245 min_stripe_size = 64 * 1024 * 1024; 2242 min_stripe_size = 64 * 1024 * 1024;
@@ -2256,51 +2253,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2256 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2253 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2257 max_chunk_size); 2254 max_chunk_size);
2258 2255
2259again:
2260 max_avail = 0;
2261 if (!map || map->num_stripes != num_stripes) {
2262 kfree(map);
2263 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2264 if (!map)
2265 return -ENOMEM;
2266 map->num_stripes = num_stripes;
2267 }
2268
2269 if (calc_size * num_stripes > max_chunk_size * ncopies) { 2256 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2270 calc_size = max_chunk_size * ncopies; 2257 calc_size = max_chunk_size * ncopies;
2271 do_div(calc_size, num_stripes); 2258 do_div(calc_size, num_stripes);
2272 do_div(calc_size, stripe_len); 2259 do_div(calc_size, BTRFS_STRIPE_LEN);
2273 calc_size *= stripe_len; 2260 calc_size *= BTRFS_STRIPE_LEN;
2274 } 2261 }
2275 2262
2276 /* we don't want tiny stripes */ 2263 /* we don't want tiny stripes */
2277 if (!looped) 2264 if (!small_stripe)
2278 calc_size = max_t(u64, min_stripe_size, calc_size); 2265 calc_size = max_t(u64, min_stripe_size, calc_size);
2279 2266
2280 /* 2267 /*
2281 * we're about to do_div by the stripe_len so lets make sure 2268 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
2282 * we end up with something bigger than a stripe 2269 * we end up with something bigger than a stripe
2283 */ 2270 */
2284 calc_size = max_t(u64, calc_size, stripe_len * 4); 2271 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2272
2273 do_div(calc_size, BTRFS_STRIPE_LEN);
2274 calc_size *= BTRFS_STRIPE_LEN;
2275
2276 return calc_size;
2277}
2278
2279static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2280 int num_stripes)
2281{
2282 struct map_lookup *new;
2283 size_t len = map_lookup_size(num_stripes);
2284
2285 BUG_ON(map->num_stripes < num_stripes);
2286
2287 if (map->num_stripes == num_stripes)
2288 return map;
2289
2290 new = kmalloc(len, GFP_NOFS);
2291 if (!new) {
2292 /* just change map->num_stripes */
2293 map->num_stripes = num_stripes;
2294 return map;
2295 }
2296
2297 memcpy(new, map, len);
2298 new->num_stripes = num_stripes;
2299 kfree(map);
2300 return new;
2301}
2302
2303/*
2304 * helper to allocate device space from btrfs_device_info, in which we stored
2305 * max free space information of every device. It is used when we can not
2306 * allocate chunks by default size.
2307 *
2308 * By this helper, we can allocate a new chunk as larger as possible.
2309 */
2310static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2311 struct btrfs_fs_devices *fs_devices,
2312 struct btrfs_device_info *devices,
2313 int nr_device, u64 type,
2314 struct map_lookup **map_lookup,
2315 int min_stripes, u64 *stripe_size)
2316{
2317 int i, index, sort_again = 0;
2318 int min_devices = min_stripes;
2319 u64 max_avail, min_free;
2320 struct map_lookup *map = *map_lookup;
2321 int ret;
2322
2323 if (nr_device < min_stripes)
2324 return -ENOSPC;
2325
2326 btrfs_descending_sort_devices(devices, nr_device);
2327
2328 max_avail = devices[0].max_avail;
2329 if (!max_avail)
2330 return -ENOSPC;
2331
2332 for (i = 0; i < nr_device; i++) {
2333 /*
2334 * if dev_offset = 0, it means the free space of this device
2335 * is less than what we need, and we didn't search max avail
2336 * extent on this device, so do it now.
2337 */
2338 if (!devices[i].dev_offset) {
2339 ret = find_free_dev_extent(trans, devices[i].dev,
2340 max_avail,
2341 &devices[i].dev_offset,
2342 &devices[i].max_avail);
2343 if (ret != 0 && ret != -ENOSPC)
2344 return ret;
2345 sort_again = 1;
2346 }
2347 }
2348
2349 /* we update the max avail free extent of each devices, sort again */
2350 if (sort_again)
2351 btrfs_descending_sort_devices(devices, nr_device);
2352
2353 if (type & BTRFS_BLOCK_GROUP_DUP)
2354 min_devices = 1;
2355
2356 if (!devices[min_devices - 1].max_avail)
2357 return -ENOSPC;
2358
2359 max_avail = devices[min_devices - 1].max_avail;
2360 if (type & BTRFS_BLOCK_GROUP_DUP)
2361 do_div(max_avail, 2);
2285 2362
2286 do_div(calc_size, stripe_len); 2363 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2287 calc_size *= stripe_len; 2364 min_stripes, 1);
2365 if (type & BTRFS_BLOCK_GROUP_DUP)
2366 min_free = max_avail * 2;
2367 else
2368 min_free = max_avail;
2369
2370 if (min_free > devices[min_devices - 1].max_avail)
2371 return -ENOSPC;
2372
2373 map = __shrink_map_lookup_stripes(map, min_stripes);
2374 *stripe_size = max_avail;
2375
2376 index = 0;
2377 for (i = 0; i < min_stripes; i++) {
2378 map->stripes[i].dev = devices[index].dev;
2379 map->stripes[i].physical = devices[index].dev_offset;
2380 if (type & BTRFS_BLOCK_GROUP_DUP) {
2381 i++;
2382 map->stripes[i].dev = devices[index].dev;
2383 map->stripes[i].physical = devices[index].dev_offset +
2384 max_avail;
2385 }
2386 index++;
2387 }
2388 *map_lookup = map;
2389
2390 return 0;
2391}
2392
2393static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2394 struct btrfs_root *extent_root,
2395 struct map_lookup **map_ret,
2396 u64 *num_bytes, u64 *stripe_size,
2397 u64 start, u64 type)
2398{
2399 struct btrfs_fs_info *info = extent_root->fs_info;
2400 struct btrfs_device *device = NULL;
2401 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2402 struct list_head *cur;
2403 struct map_lookup *map;
2404 struct extent_map_tree *em_tree;
2405 struct extent_map *em;
2406 struct btrfs_device_info *devices_info;
2407 struct list_head private_devs;
2408 u64 calc_size = 1024 * 1024 * 1024;
2409 u64 min_free;
2410 u64 avail;
2411 u64 dev_offset;
2412 int num_stripes;
2413 int min_stripes;
2414 int sub_stripes;
2415 int min_devices; /* the min number of devices we need */
2416 int i;
2417 int ret;
2418 int index;
2419
2420 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2421 (type & BTRFS_BLOCK_GROUP_DUP)) {
2422 WARN_ON(1);
2423 type &= ~BTRFS_BLOCK_GROUP_DUP;
2424 }
2425 if (list_empty(&fs_devices->alloc_list))
2426 return -ENOSPC;
2427
2428 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2429 &min_stripes, &sub_stripes);
2430 if (ret)
2431 return ret;
2432
2433 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2434 GFP_NOFS);
2435 if (!devices_info)
2436 return -ENOMEM;
2437
2438 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2439 if (!map) {
2440 ret = -ENOMEM;
2441 goto error;
2442 }
2443 map->num_stripes = num_stripes;
2288 2444
2289 cur = fs_devices->alloc_list.next; 2445 cur = fs_devices->alloc_list.next;
2290 index = 0; 2446 index = 0;
2447 i = 0;
2291 2448
2292 if (type & BTRFS_BLOCK_GROUP_DUP) 2449 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2450 num_stripes, 0);
2451
2452 if (type & BTRFS_BLOCK_GROUP_DUP) {
2293 min_free = calc_size * 2; 2453 min_free = calc_size * 2;
2294 else 2454 min_devices = 1;
2455 } else {
2295 min_free = calc_size; 2456 min_free = calc_size;
2296 2457 min_devices = min_stripes;
2297 /* 2458 }
2298 * we add 1MB because we never use the first 1MB of the device, unless
2299 * we've looped, then we are likely allocating the maximum amount of
2300 * space left already
2301 */
2302 if (!looped)
2303 min_free += 1024 * 1024;
2304 2459
2305 INIT_LIST_HEAD(&private_devs); 2460 INIT_LIST_HEAD(&private_devs);
2306 while (index < num_stripes) { 2461 while (index < num_stripes) {
@@ -2313,27 +2468,39 @@ again:
2313 cur = cur->next; 2468 cur = cur->next;
2314 2469
2315 if (device->in_fs_metadata && avail >= min_free) { 2470 if (device->in_fs_metadata && avail >= min_free) {
2316 ret = find_free_dev_extent(trans, device, 2471 ret = find_free_dev_extent(trans, device, min_free,
2317 min_free, &dev_offset, 2472 &devices_info[i].dev_offset,
2318 &max_avail); 2473 &devices_info[i].max_avail);
2319 if (ret == 0) { 2474 if (ret == 0) {
2320 list_move_tail(&device->dev_alloc_list, 2475 list_move_tail(&device->dev_alloc_list,
2321 &private_devs); 2476 &private_devs);
2322 map->stripes[index].dev = device; 2477 map->stripes[index].dev = device;
2323 map->stripes[index].physical = dev_offset; 2478 map->stripes[index].physical =
2479 devices_info[i].dev_offset;
2324 index++; 2480 index++;
2325 if (type & BTRFS_BLOCK_GROUP_DUP) { 2481 if (type & BTRFS_BLOCK_GROUP_DUP) {
2326 map->stripes[index].dev = device; 2482 map->stripes[index].dev = device;
2327 map->stripes[index].physical = 2483 map->stripes[index].physical =
2328 dev_offset + calc_size; 2484 devices_info[i].dev_offset +
2485 calc_size;
2329 index++; 2486 index++;
2330 } 2487 }
2331 } 2488 } else if (ret != -ENOSPC)
2332 } else if (device->in_fs_metadata && avail > max_avail) 2489 goto error;
2333 max_avail = avail; 2490
2491 devices_info[i].dev = device;
2492 i++;
2493 } else if (device->in_fs_metadata &&
2494 avail >= BTRFS_STRIPE_LEN) {
2495 devices_info[i].dev = device;
2496 devices_info[i].max_avail = avail;
2497 i++;
2498 }
2499
2334 if (cur == &fs_devices->alloc_list) 2500 if (cur == &fs_devices->alloc_list)
2335 break; 2501 break;
2336 } 2502 }
2503
2337 list_splice(&private_devs, &fs_devices->alloc_list); 2504 list_splice(&private_devs, &fs_devices->alloc_list);
2338 if (index < num_stripes) { 2505 if (index < num_stripes) {
2339 if (index >= min_stripes) { 2506 if (index >= min_stripes) {
@@ -2342,36 +2509,36 @@ again:
2342 num_stripes /= sub_stripes; 2509 num_stripes /= sub_stripes;
2343 num_stripes *= sub_stripes; 2510 num_stripes *= sub_stripes;
2344 } 2511 }
2345 looped = 1; 2512
2346 goto again; 2513 map = __shrink_map_lookup_stripes(map, num_stripes);
2347 } 2514 } else if (i >= min_devices) {
2348 if (!looped && max_avail > 0) { 2515 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2349 looped = 1; 2516 devices_info, i, type,
2350 calc_size = max_avail; 2517 &map, min_stripes,
2351 if (type & BTRFS_BLOCK_GROUP_DUP) 2518 &calc_size);
2352 do_div(calc_size, 2); 2519 if (ret)
2353 goto again; 2520 goto error;
2521 } else {
2522 ret = -ENOSPC;
2523 goto error;
2354 } 2524 }
2355 kfree(map);
2356 return -ENOSPC;
2357 } 2525 }
2358 map->sector_size = extent_root->sectorsize; 2526 map->sector_size = extent_root->sectorsize;
2359 map->stripe_len = stripe_len; 2527 map->stripe_len = BTRFS_STRIPE_LEN;
2360 map->io_align = stripe_len; 2528 map->io_align = BTRFS_STRIPE_LEN;
2361 map->io_width = stripe_len; 2529 map->io_width = BTRFS_STRIPE_LEN;
2362 map->type = type; 2530 map->type = type;
2363 map->num_stripes = num_stripes;
2364 map->sub_stripes = sub_stripes; 2531 map->sub_stripes = sub_stripes;
2365 2532
2366 *map_ret = map; 2533 *map_ret = map;
2367 *stripe_size = calc_size; 2534 *stripe_size = calc_size;
2368 *num_bytes = chunk_bytes_by_type(type, calc_size, 2535 *num_bytes = chunk_bytes_by_type(type, calc_size,
2369 num_stripes, sub_stripes); 2536 map->num_stripes, sub_stripes);
2370 2537
2371 em = alloc_extent_map(GFP_NOFS); 2538 em = alloc_extent_map(GFP_NOFS);
2372 if (!em) { 2539 if (!em) {
2373 kfree(map); 2540 ret = -ENOMEM;
2374 return -ENOMEM; 2541 goto error;
2375 } 2542 }
2376 em->bdev = (struct block_device *)map; 2543 em->bdev = (struct block_device *)map;
2377 em->start = start; 2544 em->start = start;
@@ -2404,7 +2571,13 @@ again:
2404 index++; 2571 index++;
2405 } 2572 }
2406 2573
2574 kfree(devices_info);
2407 return 0; 2575 return 0;
2576
2577error:
2578 kfree(map);
2579 kfree(devices_info);
2580 return ret;
2408} 2581}
2409 2582
2410static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2583static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a668c0116982..a5cfedf393f9 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -137,6 +140,27 @@ struct btrfs_multi_bio {
137 struct btrfs_bio_stripe stripes[]; 140 struct btrfs_bio_stripe stripes[];
138}; 141};
139 142
143struct btrfs_device_info {
144 struct btrfs_device *dev;
145 u64 dev_offset;
146 u64 max_avail;
147};
148
149/* Used to sort the devices by max_avail(descending sort) */
150int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
151
152/*
153 * sort the devices by max_avail, in which max free extent size of each device
154 * is stored.(Descending Sort)
155 */
156static inline void btrfs_descending_sort_devices(
157 struct btrfs_device_info *devices,
158 size_t nr_devices)
159{
160 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
161 btrfs_cmp_device_free_bytes, NULL);
162}
163
140#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 164#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
141 (sizeof(struct btrfs_bio_stripe) * (n))) 165 (sizeof(struct btrfs_bio_stripe) * (n)))
142 166