Diffstat (limited to 'fs')
163 files changed, 8550 insertions, 4041 deletions
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 9c5e6b2cd11..c2183f3917c 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -22,6 +22,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
+#include <linux/export.h>
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/slab.h>
diff --git a/fs/bio.c b/fs/bio.c
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -255,7 +255,6 @@ void bio_init(struct bio *bio)
 {
         memset(bio, 0, sizeof(*bio));
         bio->bi_flags = 1 << BIO_UPTODATE;
-        bio->bi_comp_cpu = -1;
         atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -338,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio)
  * RETURNS:
  * Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
         struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
 
@@ -366,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio)
  * %__GFP_WAIT, the allocation is guaranteed to succeed.
  *
  **/
-struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
         struct bio *bio;
 
@@ -697,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd)
         kfree(bmd);
 }
 
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+static struct bio_map_data *bio_alloc_map_data(int nr_segs,
+                                               unsigned int iov_count,
                                                gfp_t gfp_mask)
 {
         struct bio_map_data *bmd;
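
A note on the prototype changes above: nr_iovecs and iov_count become
unsigned int, presumably so a negative count from a confused caller can no
longer slip past an upper-bound check. A minimal sketch of the failure mode
(the UIO_MAXIOV comparison mirrors the allocator's range check as I
understand it; 'nr' is a hypothetical caller value, not from this patch):

    int nr = -1;                    /* hypothetical buggy caller value */

    /* signed:   (-1 > UIO_MAXIOV) is false, the bogus count gets through */
    /* unsigned: -1 wraps to ~4 billion and the same check rejects it     */
    if ((unsigned int)nr > UIO_MAXIOV)
            return NULL;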
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 95f786ec7f0..b07f1da1de4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -971,7 +971,7 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
 
         if (!bdev->bd_disk)
                 return;
-        if (disk_partitionable(bdev->bd_disk))
+        if (disk_part_scan_enabled(bdev->bd_disk))
                 bdev->bd_invalidated = 1;
 }
 
@@ -1085,6 +1085,7 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
         struct gendisk *disk;
+        struct module *owner;
         int ret;
         int partno;
         int perm = 0;
@@ -1110,6 +1111,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
         disk = get_gendisk(bdev->bd_dev, &partno);
         if (!disk)
                 goto out;
+        owner = disk->fops->owner;
 
         disk_block_events(disk);
         mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1137,8 +1139,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                         bdev->bd_disk = NULL;
                                         mutex_unlock(&bdev->bd_mutex);
                                         disk_unblock_events(disk);
-                                        module_put(disk->fops->owner);
                                         put_disk(disk);
+                                        module_put(owner);
                                         goto restart;
                                 }
                         }
@@ -1194,8 +1196,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
                                 goto out_unlock_bdev;
                 }
                 /* only one opener holds refs to the module and disk */
-                module_put(disk->fops->owner);
                 put_disk(disk);
+                module_put(owner);
         }
         bdev->bd_openers++;
         if (for_part)
@@ -1215,8 +1217,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
         mutex_unlock(&bdev->bd_mutex);
         disk_unblock_events(disk);
-        module_put(disk->fops->owner);
         put_disk(disk);
+        module_put(owner);
  out:
         bdput(bdev);
 
@@ -1442,14 +1444,15 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
         if (!bdev->bd_openers) {
                 struct module *owner = disk->fops->owner;
 
-                put_disk(disk);
-                module_put(owner);
                 disk_put_part(bdev->bd_part);
                 bdev->bd_part = NULL;
                 bdev->bd_disk = NULL;
                 if (bdev != bdev->bd_contains)
                         victim = bdev->bd_contains;
                 bdev->bd_contains = NULL;
+
+                put_disk(disk);
+                module_put(owner);
         }
         mutex_unlock(&bdev->bd_mutex);
         bdput(bdev);
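
The block_dev.c hunks above all make the same fix: disk->fops->owner must
not be dereferenced after put_disk(), because dropping the last disk
reference can free 'disk' (and with it the fops pointer). The patch caches
the owner up front and drops the module reference last. A simplified sketch
of the resulting teardown order (not a drop-in excerpt):

    struct module *owner = disk->fops->owner; /* while 'disk' is valid */

    put_disk(disk);     /* may free 'disk'; don't touch it afterwards  */
    module_put(owner);  /* module may unload only once 'disk' is gone  */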
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21..c0ddfd29c5e 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
            extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
            extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
            export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+           reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a1..89b156d85d6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
                 if (!value)
                         return ERR_PTR(-ENOMEM);
                 size = __btrfs_getxattr(inode, name, value, size);
-                if (size > 0) {
-                        acl = posix_acl_from_xattr(value, size);
-                        if (IS_ERR(acl)) {
-                                kfree(value);
-                                return acl;
-                        }
-                        set_cached_acl(inode, type, acl);
-                }
-                kfree(value);
+        }
+        if (size > 0) {
+                acl = posix_acl_from_xattr(value, size);
         } else if (size == -ENOENT || size == -ENODATA || size == 0) {
                 /* FIXME, who returns -ENOENT? I think nobody */
                 acl = NULL;
-                set_cached_acl(inode, type, acl);
         } else {
                 acl = ERR_PTR(-EIO);
         }
+        kfree(value);
+
+        if (!IS_ERR(acl))
+                set_cached_acl(inode, type, acl);
 
         return acl;
 }
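
The net effect of the acl.c rework above: value is now freed on every path,
and set_cached_acl() runs for both a parsed ACL and the NULL "no ACL"
result. Since btrfs_get_acl() opens with a cache lookup, a repeat call then
short-circuits; a sketch of that existing entry check (from my reading of
the surrounding function, not shown in this hunk):

    acl = get_cached_acl(inode, type);
    if (acl != ACL_NOT_CACHED)
            return acl;     /* may be NULL, i.e. a cached "no ACL" */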
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 00000000000..22c64fff1bd
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "backref.h"
+
+struct __data_ref {
+        struct list_head list;
+        u64 inum;
+        u64 root;
+        u64 extent_data_item_offset;
+};
+
+struct __shared_ref {
+        struct list_head list;
+        u64 disk_byte;
+};
+
+static int __inode_info(u64 inum, u64 ioff, u8 key_type,
+                        struct btrfs_root *fs_root, struct btrfs_path *path,
+                        struct btrfs_key *found_key)
+{
+        int ret;
+        struct btrfs_key key;
+        struct extent_buffer *eb;
+
+        key.type = key_type;
+        key.objectid = inum;
+        key.offset = ioff;
+
+        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+        if (ret < 0)
+                return ret;
+
+        eb = path->nodes[0];
+        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
+                ret = btrfs_next_leaf(fs_root, path);
+                if (ret)
+                        return ret;
+                eb = path->nodes[0];
+        }
+
+        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
+        if (found_key->type != key.type || found_key->objectid != key.objectid)
+                return 1;
+
+        return 0;
+}
+
+/*
+ * this makes the path point to (inum INODE_ITEM ioff)
+ */
+int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+                        struct btrfs_path *path)
+{
+        struct btrfs_key key;
+        return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
+                                &key);
+}
+
+static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+                                struct btrfs_path *path,
+                                struct btrfs_key *found_key)
+{
+        return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
+                                found_key);
+}
+
+/*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+                                struct btrfs_inode_ref *iref,
+                                struct extent_buffer *eb_in, u64 parent,
+                                char *dest, u32 size)
+{
+        u32 len;
+        int slot;
+        u64 next_inum;
+        int ret;
+        s64 bytes_left = size - 1;
+        struct extent_buffer *eb = eb_in;
+        struct btrfs_key found_key;
+
+        if (bytes_left >= 0)
+                dest[bytes_left] = '\0';
+
+        while (1) {
+                len = btrfs_inode_ref_name_len(eb, iref);
+                bytes_left -= len;
+                if (bytes_left >= 0)
+                        read_extent_buffer(eb, dest + bytes_left,
+                                        (unsigned long)(iref + 1), len);
+                if (eb != eb_in)
+                        free_extent_buffer(eb);
+                ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+                if (ret)
+                        break;
+                next_inum = found_key.offset;
+
+                /* regular exit ahead */
+                if (parent == next_inum)
+                        break;
+
+                slot = path->slots[0];
+                eb = path->nodes[0];
+                /* make sure we can use eb after releasing the path */
+                if (eb != eb_in)
+                        atomic_inc(&eb->refs);
+                btrfs_release_path(path);
+
+                iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+                parent = next_inum;
+                --bytes_left;
+                if (bytes_left >= 0)
+                        dest[bytes_left] = '/';
+        }
+
+        btrfs_release_path(path);
+
+        if (ret)
+                return ERR_PTR(ret);
+
+        return dest + bytes_left;
+}
+
+/*
+ * this makes the path point to (logical EXTENT_ITEM *)
+ * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
+ * tree blocks and <0 on error.
+ */
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+                        struct btrfs_path *path, struct btrfs_key *found_key)
+{
+        int ret;
+        u64 flags;
+        u32 item_size;
+        struct extent_buffer *eb;
+        struct btrfs_extent_item *ei;
+        struct btrfs_key key;
+
+        key.type = BTRFS_EXTENT_ITEM_KEY;
+        key.objectid = logical;
+        key.offset = (u64)-1;
+
+        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+        if (ret < 0)
+                return ret;
+        ret = btrfs_previous_item(fs_info->extent_root, path,
+                                        0, BTRFS_EXTENT_ITEM_KEY);
+        if (ret < 0)
+                return ret;
+
+        btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
+        if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+            found_key->objectid > logical ||
+            found_key->objectid + found_key->offset <= logical)
+                return -ENOENT;
+
+        eb = path->nodes[0];
+        item_size = btrfs_item_size_nr(eb, path->slots[0]);
+        BUG_ON(item_size < sizeof(*ei));
+
+        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+        flags = btrfs_extent_flags(eb, ei);
+
+        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+                return BTRFS_EXTENT_FLAG_TREE_BLOCK;
+        if (flags & BTRFS_EXTENT_FLAG_DATA)
+                return BTRFS_EXTENT_FLAG_DATA;
+
+        return -EIO;
+}
+
+/*
+ * helper function to iterate extent inline refs. ptr must point to a 0 value
+ * for the first call and may be modified. it is used to track state.
+ * if more refs exist, 0 is returned and the next call to
+ * __get_extent_inline_ref must pass the modified ptr parameter to get the
+ * next ref. after the last ref was processed, 1 is returned.
+ * returns <0 on error
+ */
+static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
+                                struct btrfs_extent_item *ei, u32 item_size,
+                                struct btrfs_extent_inline_ref **out_eiref,
+                                int *out_type)
+{
+        unsigned long end;
+        u64 flags;
+        struct btrfs_tree_block_info *info;
+
+        if (!*ptr) {
+                /* first call */
+                flags = btrfs_extent_flags(eb, ei);
+                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+                        info = (struct btrfs_tree_block_info *)(ei + 1);
+                        *out_eiref =
+                                (struct btrfs_extent_inline_ref *)(info + 1);
+                } else {
+                        *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
+                }
+                *ptr = (unsigned long)*out_eiref;
+                if ((void *)*ptr >= (void *)ei + item_size)
+                        return -ENOENT;
+        }
+
+        end = (unsigned long)ei + item_size;
+        *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
+        *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
+
+        *ptr += btrfs_extent_inline_ref_size(*out_type);
+        WARN_ON(*ptr > end);
+        if (*ptr == end)
+                return 1; /* last */
+
+        return 0;
+}
+
+/*
+ * reads the tree block backref for an extent. tree level and root are returned
+ * through out_level and out_root. ptr must point to a 0 value for the first
+ * call and may be modified (see __get_extent_inline_ref comment).
+ * returns 0 if data was provided, 1 if there was no more data to provide or
+ * <0 on error.
+ */
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+                                struct btrfs_extent_item *ei, u32 item_size,
+                                u64 *out_root, u8 *out_level)
+{
+        int ret;
+        int type;
+        struct btrfs_tree_block_info *info;
+        struct btrfs_extent_inline_ref *eiref;
+
+        if (*ptr == (unsigned long)-1)
+                return 1;
+
+        while (1) {
+                ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
+                                                &eiref, &type);
+                if (ret < 0)
+                        return ret;
+
+                if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+                    type == BTRFS_SHARED_BLOCK_REF_KEY)
+                        break;
+
+                if (ret == 1)
+                        return 1;
+        }
+
+        /* we can treat both ref types equally here */
+        info = (struct btrfs_tree_block_info *)(ei + 1);
+        *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
+        *out_level = btrfs_tree_block_level(eb, info);
+
+        if (ret == 1)
+                *ptr = (unsigned long)-1;
+
+        return 0;
+}
+
+static int __data_list_add(struct list_head *head, u64 inum,
+                                u64 extent_data_item_offset, u64 root)
+{
+        struct __data_ref *ref;
+
+        ref = kmalloc(sizeof(*ref), GFP_NOFS);
+        if (!ref)
+                return -ENOMEM;
+
+        ref->inum = inum;
+        ref->extent_data_item_offset = extent_data_item_offset;
+        ref->root = root;
+        list_add_tail(&ref->list, head);
+
+        return 0;
+}
+
+static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
+                                struct btrfs_extent_data_ref *dref)
+{
+        return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
+                                btrfs_extent_data_ref_offset(eb, dref),
+                                btrfs_extent_data_ref_root(eb, dref));
+}
+
+static int __shared_list_add(struct list_head *head, u64 disk_byte)
+{
+        struct __shared_ref *ref;
+
+        ref = kmalloc(sizeof(*ref), GFP_NOFS);
+        if (!ref)
+                return -ENOMEM;
+
+        ref->disk_byte = disk_byte;
+        list_add_tail(&ref->list, head);
+
+        return 0;
+}
+
+static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
+                                u64 logical, u64 inum,
+                                u64 extent_data_item_offset,
+                                u64 extent_offset,
+                                struct btrfs_path *path,
+                                struct list_head *data_refs,
+                                iterate_extent_inodes_t *iterate,
+                                void *ctx)
+{
+        u64 ref_root;
+        u32 item_size;
+        struct btrfs_key key;
+        struct extent_buffer *eb;
+        struct btrfs_extent_item *ei;
+        struct btrfs_extent_inline_ref *eiref;
+        struct __data_ref *ref;
+        int ret;
+        int type;
+        int last;
+        unsigned long ptr = 0;
+
+        WARN_ON(!list_empty(data_refs));
+        ret = extent_from_logical(fs_info, logical, path, &key);
+        if (ret & BTRFS_EXTENT_FLAG_DATA)
+                ret = -EIO;
+        if (ret < 0)
+                goto out;
+
+        eb = path->nodes[0];
+        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+        item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+        ret = 0;
+        ref_root = 0;
+        /*
+         * as done in iterate_extent_inodes, we first build a list of refs to
+         * iterate, then free the path and then iterate them to avoid deadlocks.
+         */
+        do {
+                last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
+                                                &eiref, &type);
+                if (last < 0) {
+                        ret = last;
+                        goto out;
+                }
+                if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+                    type == BTRFS_SHARED_BLOCK_REF_KEY) {
+                        ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
+                        ret = __data_list_add(data_refs, inum,
+                                                extent_data_item_offset,
+                                                ref_root);
+                }
+        } while (!ret && !last);
+
+        btrfs_release_path(path);
+
+        if (ref_root == 0) {
+                printk(KERN_ERR "btrfs: failed to find tree block ref "
+                        "for shared data backref %llu\n", logical);
+                WARN_ON(1);
+                ret = -EIO;
+        }
+
+out:
+        while (!list_empty(data_refs)) {
+                ref = list_first_entry(data_refs, struct __data_ref, list);
+                list_del(&ref->list);
+                if (!ret)
+                        ret = iterate(ref->inum, extent_offset +
+                                        ref->extent_data_item_offset,
+                                        ref->root, ctx);
+                kfree(ref);
+        }
+
+        return ret;
+}
+
+static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
+                                u64 logical, u64 orig_extent_item_objectid,
+                                u64 extent_offset, struct btrfs_path *path,
+                                struct list_head *data_refs,
+                                iterate_extent_inodes_t *iterate,
+                                void *ctx)
+{
+        u64 disk_byte;
+        struct btrfs_key key;
+        struct btrfs_file_extent_item *fi;
+        struct extent_buffer *eb;
+        int slot;
+        int nritems;
+        int ret;
+        int found = 0;
+
+        eb = read_tree_block(fs_info->tree_root, logical,
+                                fs_info->tree_root->leafsize, 0);
+        if (!eb)
+                return -EIO;
+
+        /*
+         * from the shared data ref, we only have the leaf but we need
+         * the key. thus, we must look into all items and see that we
+         * find one (some) with a reference to our extent item.
+         */
+        nritems = btrfs_header_nritems(eb);
+        for (slot = 0; slot < nritems; ++slot) {
+                btrfs_item_key_to_cpu(eb, &key, slot);
+                if (key.type != BTRFS_EXTENT_DATA_KEY)
+                        continue;
+                fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+                if (!fi) {
+                        free_extent_buffer(eb);
+                        return -EIO;
+                }
+                disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+                if (disk_byte != orig_extent_item_objectid) {
+                        if (found)
+                                break;
+                        else
+                                continue;
+                }
+                ++found;
+                ret = __iter_shared_inline_ref_inodes(fs_info, logical,
+                                                        key.objectid,
+                                                        key.offset,
+                                                        extent_offset, path,
+                                                        data_refs,
+                                                        iterate, ctx);
+                if (ret)
+                        break;
+        }
+
+        if (!found) {
+                printk(KERN_ERR "btrfs: failed to follow shared data backref "
+                        "to parent %llu\n", logical);
+                WARN_ON(1);
+                ret = -EIO;
+        }
+
+        free_extent_buffer(eb);
+        return ret;
+}
+
+/*
+ * calls iterate() for every inode that references the extent identified by
+ * the given parameters. will use the path given as a parameter and return it
+ * released.
+ * when the iterator function returns a non-zero value, iteration stops.
+ */
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+                                struct btrfs_path *path,
+                                u64 extent_item_objectid,
+                                u64 extent_offset,
+                                iterate_extent_inodes_t *iterate, void *ctx)
+{
+        unsigned long ptr = 0;
+        int last;
+        int ret;
+        int type;
+        u64 logical;
+        u32 item_size;
+        struct btrfs_extent_inline_ref *eiref;
+        struct btrfs_extent_data_ref *dref;
+        struct extent_buffer *eb;
+        struct btrfs_extent_item *ei;
+        struct btrfs_key key;
+        struct list_head data_refs = LIST_HEAD_INIT(data_refs);
+        struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
+        struct __data_ref *ref_d;
+        struct __shared_ref *ref_s;
+
+        eb = path->nodes[0];
+        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+        item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+        /* first we iterate the inline refs, ... */
+        do {
+                last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
+                                                &eiref, &type);
+                if (last == -ENOENT) {
+                        ret = 0;
+                        break;
+                }
+                if (last < 0) {
+                        ret = last;
+                        break;
+                }
+
+                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+                        dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
+                        ret = __data_list_add_eb(&data_refs, eb, dref);
+                } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+                        logical = btrfs_extent_inline_ref_offset(eb, eiref);
+                        ret = __shared_list_add(&shared_refs, logical);
+                }
+        } while (!ret && !last);
+
+        /* ... then we proceed to in-tree references and ... */
+        while (!ret) {
+                ++path->slots[0];
+                if (path->slots[0] > btrfs_header_nritems(eb)) {
+                        ret = btrfs_next_leaf(fs_info->extent_root, path);
+                        if (ret) {
+                                if (ret == 1)
+                                        ret = 0; /* we're done */
+                                break;
+                        }
+                        eb = path->nodes[0];
+                }
+                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+                if (key.objectid != extent_item_objectid)
+                        break;
+                if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+                        dref = btrfs_item_ptr(eb, path->slots[0],
+                                                struct btrfs_extent_data_ref);
+                        ret = __data_list_add_eb(&data_refs, eb, dref);
+                } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+                        ret = __shared_list_add(&shared_refs, key.offset);
+                }
+        }
+
+        btrfs_release_path(path);
+
+        /*
+         * ... only at the very end we can process the refs we found. this is
+         * because the iterator function we call is allowed to make tree lookups
+         * and we have to avoid deadlocks. additionally, we need more tree
+         * lookups ourselves for shared data refs.
+         */
+        while (!list_empty(&data_refs)) {
+                ref_d = list_first_entry(&data_refs, struct __data_ref, list);
+                list_del(&ref_d->list);
+                if (!ret)
+                        ret = iterate(ref_d->inum, extent_offset +
+                                        ref_d->extent_data_item_offset,
+                                        ref_d->root, ctx);
+                kfree(ref_d);
+        }
+
+        while (!list_empty(&shared_refs)) {
+                ref_s = list_first_entry(&shared_refs, struct __shared_ref,
+                                        list);
+                list_del(&ref_s->list);
+                if (!ret)
+                        ret = __iter_shared_inline_ref(fs_info,
+                                                        ref_s->disk_byte,
+                                                        extent_item_objectid,
+                                                        extent_offset, path,
+                                                        &data_refs,
+                                                        iterate, ctx);
+                kfree(ref_s);
+        }
+
+        return ret;
+}
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+                                struct btrfs_path *path,
+                                iterate_extent_inodes_t *iterate, void *ctx)
+{
+        int ret;
+        u64 offset;
+        struct btrfs_key found_key;
+
+        ret = extent_from_logical(fs_info, logical, path,
+                                        &found_key);
+        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+                ret = -EINVAL;
+        if (ret < 0)
+                return ret;
+
+        offset = logical - found_key.objectid;
+        ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
+                                        offset, iterate, ctx);
+
+        return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+                                struct btrfs_path *path,
+                                iterate_irefs_t *iterate, void *ctx)
+{
+        int ret;
+        int slot;
+        u32 cur;
+        u32 len;
+        u32 name_len;
+        u64 parent = 0;
+        int found = 0;
+        struct extent_buffer *eb;
+        struct btrfs_item *item;
+        struct btrfs_inode_ref *iref;
+        struct btrfs_key found_key;
+
+        while (1) {
+                ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
+                                        &found_key);
+                if (ret < 0)
+                        break;
+                if (ret) {
+                        ret = found ? 0 : -ENOENT;
+                        break;
+                }
+                ++found;
+
+                parent = found_key.offset;
+                slot = path->slots[0];
+                eb = path->nodes[0];
+                /* make sure we can use eb after releasing the path */
+                atomic_inc(&eb->refs);
+                btrfs_release_path(path);
+
+                item = btrfs_item_nr(eb, slot);
+                iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+                for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+                        name_len = btrfs_inode_ref_name_len(eb, iref);
+                        /* path must be released before calling iterate()! */
+                        ret = iterate(parent, iref, eb, ctx);
+                        if (ret) {
+                                free_extent_buffer(eb);
+                                break;
+                        }
+                        len = sizeof(*iref) + name_len;
+                        iref = (struct btrfs_inode_ref *)((char *)iref + len);
+                }
+                free_extent_buffer(eb);
+        }
+
+        btrfs_release_path(path);
+
+        return ret;
+}
+
+/*
+ * returns 0 if the path could be dumped (probably truncated)
+ * returns <0 in case of an error
+ */
+static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
+                                struct extent_buffer *eb, void *ctx)
+{
+        struct inode_fs_paths *ipath = ctx;
+        char *fspath;
+        char *fspath_min;
+        int i = ipath->fspath->elem_cnt;
+        const int s_ptr = sizeof(char *);
+        u32 bytes_left;
+
+        bytes_left = ipath->fspath->bytes_left > s_ptr ?
+                        ipath->fspath->bytes_left - s_ptr : 0;
+
+        fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
+        fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
+                                inum, fspath_min, bytes_left);
+        if (IS_ERR(fspath))
+                return PTR_ERR(fspath);
+
+        if (fspath > fspath_min) {
+                ipath->fspath->val[i] = (u64)(unsigned long)fspath;
+                ++ipath->fspath->elem_cnt;
+                ipath->fspath->bytes_left = fspath - fspath_min;
+        } else {
+                ++ipath->fspath->elem_missed;
+                ipath->fspath->bytes_missing += fspath_min - fspath;
+                ipath->fspath->bytes_left = 0;
+        }
+
+        return 0;
+}
+
+/*
+ * this dumps all file system paths to the inode into the ipath struct, provided
+ * it has been created large enough. each path is zero-terminated and accessed
+ * from ipath->fspath->val[i].
+ * when it returns, there are ipath->fspath->elem_cnt number of paths available
+ * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
+ * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
+ * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
+ * have been needed to return all paths.
+ */
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
+{
+        return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
+                                inode_to_path, ipath);
+}
+
+/*
+ * allocates space to return multiple file system paths for an inode.
+ * total_bytes to allocate are passed, note that space usable for actual path
+ * information will be total_bytes - sizeof(struct inode_fs_paths).
+ * the returned pointer must be freed with free_ipath() in the end.
+ */
+struct btrfs_data_container *init_data_container(u32 total_bytes)
+{
+        struct btrfs_data_container *data;
+        size_t alloc_bytes;
+
+        alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
+        data = kmalloc(alloc_bytes, GFP_NOFS);
+        if (!data)
+                return ERR_PTR(-ENOMEM);
+
+        if (total_bytes >= sizeof(*data)) {
+                data->bytes_left = total_bytes - sizeof(*data);
+                data->bytes_missing = 0;
+        } else {
+                data->bytes_missing = sizeof(*data) - total_bytes;
+                data->bytes_left = 0;
+        }
+
+        data->elem_cnt = 0;
+        data->elem_missed = 0;
+
+        return data;
+}
+
+/*
+ * allocates space to return multiple file system paths for an inode.
+ * total_bytes to allocate are passed, note that space usable for actual path
+ * information will be total_bytes - sizeof(struct inode_fs_paths).
+ * the returned pointer must be freed with free_ipath() in the end.
+ */
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+                                        struct btrfs_path *path)
+{
+        struct inode_fs_paths *ifp;
+        struct btrfs_data_container *fspath;
+
+        fspath = init_data_container(total_bytes);
+        if (IS_ERR(fspath))
+                return (void *)fspath;
+
+        ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
+        if (!ifp) {
+                kfree(fspath);
+                return ERR_PTR(-ENOMEM);
+        }
+
+        ifp->btrfs_path = path;
+        ifp->fspath = fspath;
+        ifp->fs_root = fs_root;
+
+        return ifp;
+}
+
+void free_ipath(struct inode_fs_paths *ipath)
+{
+        kfree(ipath);
+}
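
To make the new backref API concrete, here is a hedged usage sketch: an
iterate_extent_inodes_t callback that logs every (inode, offset, root)
triple referencing the extent at a given disk logical address. The callback
body and its name are illustrative, not taken from the patch:

    static int log_one_ref(u64 inum, u64 offset, u64 root, void *ctx)
    {
            printk(KERN_INFO "extent referenced by ino %llu off %llu root %llu\n",
                   (unsigned long long)inum, (unsigned long long)offset,
                   (unsigned long long)root);
            return 0;       /* non-zero would stop the iteration */
    }

    /* later, with an allocated btrfs_path: */
    ret = iterate_inodes_from_logical(logical, fs_info, path,
                                      log_one_ref, NULL);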
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 00000000000..92618837cb8
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_BACKREF__
+#define __BTRFS_BACKREF__
+
+#include "ioctl.h"
+
+struct inode_fs_paths {
+        struct btrfs_path *btrfs_path;
+        struct btrfs_root *fs_root;
+        struct btrfs_data_container *fspath;
+};
+
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
+                                void *ctx);
+typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
+                                struct extent_buffer *eb, void *ctx);
+
+int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+                        struct btrfs_path *path);
+
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+                        struct btrfs_path *path, struct btrfs_key *found_key);
+
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+                        struct btrfs_extent_item *ei, u32 item_size,
+                        u64 *out_root, u8 *out_level);
+
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+                        struct btrfs_path *path,
+                        u64 extent_item_objectid,
+                        u64 extent_offset,
+                        iterate_extent_inodes_t *iterate, void *ctx);
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+                        struct btrfs_path *path,
+                        iterate_extent_inodes_t *iterate, void *ctx);
+
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+
+struct btrfs_data_container *init_data_container(u32 total_bytes);
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+                        struct btrfs_path *path);
+void free_ipath(struct inode_fs_paths *ipath);
+
+#endif
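
And a sketch of the path-resolution half of the interface (buffer size and
error handling here are my choices, not the patch's): allocate an ipath,
resolve every filesystem path of an inode, print them, clean up.

    struct btrfs_path *path = btrfs_alloc_path();
    struct inode_fs_paths *ipath;
    int i, ret;

    if (!path)
            return -ENOMEM;
    ipath = init_ipath(4096, fs_root, path);    /* 4096: arbitrary size */
    if (IS_ERR(ipath)) {
            btrfs_free_path(path);
            return PTR_ERR(ipath);
    }

    ret = paths_from_inode(inum, ipath);
    for (i = 0; !ret && i < ipath->fspath->elem_cnt; ++i)
            printk(KERN_INFO "path: %s\n",
                   (char *)(unsigned long)ipath->fspath->val[i]);

    free_ipath(ipath);
    btrfs_free_path(path);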
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd..634608d2a6d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
          */
         u64 delalloc_bytes;
 
-        /* total number of bytes that may be used for this inode for
-         * delalloc
-         */
-        u64 reserved_bytes;
-
         /*
          * the size of the file stored in the metadata on disk. data=ordered
          * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
          */
         u64 disk_i_size;
 
-        /* flags field from the on disk inode */
-        u32 flags;
-
         /*
          * if this is a directory then index_cnt is the counter for the index
          * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
         u64 last_unlink_trans;
 
         /*
+         * Number of bytes outstanding that are going to need csums. This is
+         * used in ENOSPC accounting.
+         */
+        u64 csum_bytes;
+
+        /* flags field from the on disk inode */
+        u32 flags;
+
+        /*
          * Counters to keep track of the number of extent item's we may use due
          * to delalloc and such. outstanding_extents is the number of extent
          * items we think we'll end up using, and reserved_extents is the number
@@ -146,14 +147,12 @@ struct btrfs_inode {
          * the btrfs file release call will add this inode to the
          * ordered operations list so that we make sure to flush out any
          * new data the application may have written before commit.
-         *
-         * yes, its silly to have a single bitflag, but we might grow more
-         * of these.
          */
         unsigned ordered_data_close:1;
         unsigned orphan_meta_reserved:1;
         unsigned dummy_inode:1;
         unsigned in_defrag:1;
+        unsigned delalloc_meta_reserved:1;
 
         /*
          * always compress this one file
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f173..14f1c5a0b2d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
                                       unsigned long disk_size)
 {
-        u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+        u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
         return sizeof(struct compressed_bio) +
                 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
                 csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8..dede441bdee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct extent_buffer *buf)
 {
+        /* ensure we can see the force_cow */
+        smp_rmb();
+
+        /*
+         * We do not need to cow a block if
+         * 1) this block is not created or changed in this transaction;
+         * 2) this block does not belong to TREE_RELOC tree;
+         * 3) the root is not forced COW.
+         *
+         * What is forced COW:
+         *    when we create a snapshot during committing the transaction,
+         *    after we've finished copying the src root, we must COW the shared
+         *    block to ensure the metadata consistency.
+         */
         if (btrfs_header_generation(buf) == trans->transid &&
             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+            !root->force_cow)
                 return 0;
         return 1;
 }
@@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-        if (level < BTRFS_MAX_LEVEL - 1)
+        if (level < BTRFS_MAX_LEVEL - 1) {
                 parent = path->nodes[level + 1];
-        pslot = path->slots[level + 1];
+                pslot = path->slots[level + 1];
+        }
 
         /*
          * deal with the case where there is only one pointer in the root
@@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
         mid = path->nodes[level];
         WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-        if (level < BTRFS_MAX_LEVEL - 1)
+        if (level < BTRFS_MAX_LEVEL - 1) {
                 parent = path->nodes[level + 1];
-        pslot = path->slots[level + 1];
+                pslot = path->slots[level + 1];
+        }
 
         if (!parent)
                 return 1;
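
should_cow_block() above only reads root->force_cow, under smp_rmb(). The
writer side is outside this excerpt; a sketch of the pairing one would
expect from the barrier comment (an assumption, not code from this diff):

    /* presumed writer, in snapshot creation: */
    root->force_cow = 1;
    smp_wmb();          /* pairs with the smp_rmb() in should_cow_block() */

    /* ... copy the source root ... */

    /* once the snapshot is fully committed: */
    root->force_cow = 0;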
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f4..50634abef9b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <linux/pagemap.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
 #define BTRFS_LABEL_SIZE 256
 
 /*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+        __le64 tree_root;
+        __le64 tree_root_gen;
+
+        __le64 chunk_root;
+        __le64 chunk_root_gen;
+
+        __le64 extent_root;
+        __le64 extent_root_gen;
+
+        __le64 fs_root;
+        __le64 fs_root_gen;
+
+        __le64 dev_root;
+        __le64 dev_root_gen;
+
+        __le64 csum_root;
+        __le64 csum_root_gen;
+
+        __le64 total_bytes;
+        __le64 bytes_used;
+        __le64 num_devices;
+        /* future */
+        __le64 unsed_64[4];
+
+        u8 tree_root_level;
+        u8 chunk_root_level;
+        u8 extent_root_level;
+        u8 fs_root_level;
+        u8 dev_root_level;
+        u8 csum_root_level;
+        /* future and to align */
+        u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
  */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
         /* future expansion */
         __le64 reserved[31];
         u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+        struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
 
 /*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
         u64 size;
         u64 reserved;
-        u64 freed[2];
         struct btrfs_space_info *space_info;
-        struct list_head list;
         spinlock_t lock;
-        atomic_t usage;
-        unsigned int priority:8;
-        unsigned int durable:1;
-        unsigned int refill_used:1;
         unsigned int full:1;
 };
 
@@ -811,7 +848,8 @@ struct btrfs_free_cluster {
 enum btrfs_caching_type {
         BTRFS_CACHE_NO          = 0,
         BTRFS_CACHE_STARTED     = 1,
-        BTRFS_CACHE_FINISHED    = 2,
+        BTRFS_CACHE_FAST        = 2,
+        BTRFS_CACHE_FINISHED    = 3,
 };
 
 enum btrfs_disk_cache_state {
@@ -840,10 +878,10 @@ struct btrfs_block_group_cache {
         spinlock_t lock;
         u64 pinned;
         u64 reserved;
-        u64 reserved_pinned;
         u64 bytes_super;
         u64 flags;
         u64 sectorsize;
+        u64 cache_generation;
         unsigned int ro:1;
         unsigned int dirty:1;
         unsigned int iref:1;
@@ -899,6 +937,10 @@ struct btrfs_fs_info {
         spinlock_t block_group_cache_lock;
         struct rb_root block_group_cache_tree;
 
+        /* keep track of unallocated space */
+        spinlock_t free_chunk_lock;
+        u64 free_chunk_space;
+
         struct extent_io_tree freed_extents[2];
         struct extent_io_tree *pinned_extents;
 
@@ -916,14 +958,11 @@ struct btrfs_fs_info {
         struct btrfs_block_rsv trans_block_rsv;
         /* block reservation for chunk tree */
         struct btrfs_block_rsv chunk_block_rsv;
+        /* block reservation for delayed operations */
+        struct btrfs_block_rsv delayed_block_rsv;
 
         struct btrfs_block_rsv empty_block_rsv;
 
-        /* list of block reservations that cross multiple transactions */
-        struct list_head durable_block_rsv_list;
-
-        struct mutex durable_block_rsv_mutex;
-
         u64 generation;
         u64 last_trans_committed;
 
@@ -942,8 +981,8 @@ struct btrfs_fs_info {
         wait_queue_head_t transaction_blocked_wait;
         wait_queue_head_t async_submit_wait;
 
-        struct btrfs_super_block super_copy;
-        struct btrfs_super_block super_for_commit;
+        struct btrfs_super_block *super_copy;
+        struct btrfs_super_block *super_for_commit;
         struct block_device *__bdev;
         struct super_block *sb;
         struct inode *btree_inode;
@@ -1036,6 +1075,7 @@ struct btrfs_fs_info {
         struct btrfs_workers endio_freespace_worker;
         struct btrfs_workers submit_workers;
         struct btrfs_workers caching_workers;
+        struct btrfs_workers readahead_workers;
 
         /*
          * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1159,13 @@ struct btrfs_fs_info {
         u64 fs_state;
 
         struct btrfs_delayed_root *delayed_root;
+
+        /* readahead tree */
+        spinlock_t reada_lock;
+        struct radix_tree_root reada_tree;
+
+        /* next backup root to be overwritten */
+        int backup_root_index;
 };
 
 /*
@@ -1225,6 +1272,8 @@ struct btrfs_root {
          * for stat. It may be used for more later
          */
         dev_t anon_dev;
+
+        int force_cow;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_ENOSPC_DEBUG        (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG         (1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE     (1 << 17)
+#define BTRFS_MOUNT_RECOVERY            (1 << 18)
 
 #define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)
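
BTRFS_MOUNT_RECOVERY joins the mount-option bitmask; such flags are read
back with btrfs_test_opt(), the existing companion to the set/clear helpers
shown above. A hedged sketch of a consumer (the backup-root fallback is my
guess at the intended user, given the super_roots additions earlier in this
diff; 'try_backup_roots' is a hypothetical variable):

    if (btrfs_test_opt(root, RECOVERY))
            try_backup_roots = 1;   /* hypothetical flag */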
@@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root) | |||
1978 | return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; | 2028 | return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; |
1979 | } | 2029 | } |
1980 | 2030 | ||
2031 | /* struct btrfs_root_backup */ | ||
2032 | BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, | ||
2033 | tree_root, 64); | ||
2034 | BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, | ||
2035 | tree_root_gen, 64); | ||
2036 | BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, | ||
2037 | tree_root_level, 8); | ||
2038 | |||
2039 | BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, | ||
2040 | chunk_root, 64); | ||
2041 | BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, | ||
2042 | chunk_root_gen, 64); | ||
2043 | BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, | ||
2044 | chunk_root_level, 8); | ||
2045 | |||
2046 | BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, | ||
2047 | extent_root, 64); | ||
2048 | BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, | ||
2049 | extent_root_gen, 64); | ||
2050 | BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, | ||
2051 | extent_root_level, 8); | ||
2052 | |||
2053 | BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, | ||
2054 | fs_root, 64); | ||
2055 | BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, | ||
2056 | fs_root_gen, 64); | ||
2057 | BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, | ||
2058 | fs_root_level, 8); | ||
2059 | |||
2060 | BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, | ||
2061 | dev_root, 64); | ||
2062 | BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, | ||
2063 | dev_root_gen, 64); | ||
2064 | BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, | ||
2065 | dev_root_level, 8); | ||
2066 | |||
2067 | BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, | ||
2068 | csum_root, 64); | ||
2069 | BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, | ||
2070 | csum_root_gen, 64); | ||
2071 | BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, | ||
2072 | csum_root_level, 8); | ||
2073 | BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, | ||
2074 | total_bytes, 64); | ||
2075 | BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, | ||
2076 | bytes_used, 64); | ||
2077 | BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, | ||
2078 | num_devices, 64); | ||
2079 | |||
1981 | /* struct btrfs_super_block */ | 2080 | /* struct btrfs_super_block */ |
1982 | 2081 | ||
1983 | BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); | 2082 | BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); |
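[editor's note] Each BTRFS_SETGET_STACK_FUNCS() invocation above stamps out a get/set pair that converts between CPU byte order and the on-disk little-endian layout of the stack structure. Roughly what one invocation generates (a sketch of the expanded shape, not the exact macro body):

	/* approximate expansion of BTRFS_SETGET_STACK_FUNCS(backup_tree_root,
	 * struct btrfs_root_backup, tree_root, 64) */
	static inline u64 btrfs_backup_tree_root(struct btrfs_root_backup *s)
	{
		return le64_to_cpu(s->tree_root);
	}

	static inline void btrfs_set_backup_tree_root(struct btrfs_root_backup *s,
						      u64 val)
	{
		s->tree_root = cpu_to_le64(val);
	}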
@@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) | |||
2129 | (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); | 2228 | (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); |
2130 | } | 2229 | } |
2131 | 2230 | ||
2231 | static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) | ||
2232 | { | ||
2233 | return mapping_gfp_mask(mapping) & ~__GFP_FS; | ||
2234 | } | ||
2235 | |||
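[editor's note] btrfs_alloc_write_mask() strips __GFP_FS so that page allocations made while btrfs holds locks cannot recurse into filesystem reclaim and deadlock. A hypothetical call site (variable names assumed for illustration):

	/* hypothetical: allocate a pagecache page for a buffered write
	 * without letting reclaim re-enter the filesystem */
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	struct page *page = find_or_create_page(inode->i_mapping, index, mask);

	if (!page)
		return -ENOMEM;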
2132 | /* extent-tree.c */ | 2236 | /* extent-tree.c */ |
2133 | static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, | 2237 | static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, |
2134 | unsigned num_items) | 2238 | unsigned num_items) |
@@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, | |||
2137 | 3 * num_items; | 2241 | 3 * num_items; |
2138 | } | 2242 | } |
2139 | 2243 | ||
2244 | /* | ||
2245 | * Doing a truncate won't result in new nodes or leaves, just what we need for | ||
2246 | * COW. | ||
2247 | */ | ||
2248 | static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, | ||
2249 | unsigned num_items) | ||
2250 | { | ||
2251 | return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * | ||
2252 | num_items; | ||
2253 | } | ||
2254 | |||
2140 | void btrfs_put_block_group(struct btrfs_block_group_cache *cache); | 2255 | void btrfs_put_block_group(struct btrfs_block_group_cache *cache); |
2141 | int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | 2256 | int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, |
2142 | struct btrfs_root *root, unsigned long count); | 2257 | struct btrfs_root *root, unsigned long count); |
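[editor's note] For concreteness, assuming 4KB leaves and nodes and BTRFS_MAX_LEVEL == 8: the truncate estimate is (4096 + 4096 * 7) * num_items = 32768 bytes (32KB) per item, one third of the 98304 bytes (96KB) per item that btrfs_calc_trans_metadata_size() reserves, because a truncate pays for one COW per level but never for node or leaf splits.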
@@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, | |||
2146 | u64 num_bytes, u64 *refs, u64 *flags); | 2261 | u64 num_bytes, u64 *refs, u64 *flags); |
2147 | int btrfs_pin_extent(struct btrfs_root *root, | 2262 | int btrfs_pin_extent(struct btrfs_root *root, |
2148 | u64 bytenr, u64 num, int reserved); | 2263 | u64 bytenr, u64 num, int reserved); |
2264 | int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, | ||
2265 | struct btrfs_root *root, | ||
2266 | u64 bytenr, u64 num_bytes); | ||
2149 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, | 2267 | int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, |
2150 | struct btrfs_root *root, | 2268 | struct btrfs_root *root, |
2151 | u64 objectid, u64 offset, u64 bytenr); | 2269 | u64 objectid, u64 offset, u64 bytenr); |
@@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, | |||
2196 | u64 root_objectid, u64 owner, u64 offset); | 2314 | u64 root_objectid, u64 owner, u64 offset); |
2197 | 2315 | ||
2198 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); | 2316 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); |
2199 | int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, | 2317 | int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, |
2200 | u64 num_bytes, int reserve, int sinfo); | 2318 | u64 start, u64 len); |
2201 | int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, | 2319 | int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, |
2202 | struct btrfs_root *root); | 2320 | struct btrfs_root *root); |
2203 | int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | 2321 | int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, |
@@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); | |||
2240 | struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); | 2358 | struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); |
2241 | void btrfs_free_block_rsv(struct btrfs_root *root, | 2359 | void btrfs_free_block_rsv(struct btrfs_root *root, |
2242 | struct btrfs_block_rsv *rsv); | 2360 | struct btrfs_block_rsv *rsv); |
2243 | void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, | 2361 | int btrfs_block_rsv_add(struct btrfs_root *root, |
2244 | struct btrfs_block_rsv *rsv); | ||
2245 | int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, | ||
2246 | struct btrfs_root *root, | ||
2247 | struct btrfs_block_rsv *block_rsv, | 2362 | struct btrfs_block_rsv *block_rsv, |
2248 | u64 num_bytes); | 2363 | u64 num_bytes); |
2249 | int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, | 2364 | int btrfs_block_rsv_add_noflush(struct btrfs_root *root, |
2250 | struct btrfs_root *root, | 2365 | struct btrfs_block_rsv *block_rsv, |
2366 | u64 num_bytes); | ||
2367 | int btrfs_block_rsv_check(struct btrfs_root *root, | ||
2368 | struct btrfs_block_rsv *block_rsv, int min_factor); | ||
2369 | int btrfs_block_rsv_refill(struct btrfs_root *root, | ||
2251 | struct btrfs_block_rsv *block_rsv, | 2370 | struct btrfs_block_rsv *block_rsv, |
2252 | u64 min_reserved, int min_factor); | 2371 | u64 min_reserved); |
2372 | int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, | ||
2373 | struct btrfs_block_rsv *block_rsv, | ||
2374 | u64 min_reserved); | ||
2253 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, | 2375 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, |
2254 | struct btrfs_block_rsv *dst_rsv, | 2376 | struct btrfs_block_rsv *dst_rsv, |
2255 | u64 num_bytes); | 2377 | u64 num_bytes); |
2256 | void btrfs_block_rsv_release(struct btrfs_root *root, | 2378 | void btrfs_block_rsv_release(struct btrfs_root *root, |
2257 | struct btrfs_block_rsv *block_rsv, | 2379 | struct btrfs_block_rsv *block_rsv, |
2258 | u64 num_bytes); | 2380 | u64 num_bytes); |
2259 | int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, | ||
2260 | struct btrfs_root *root, | ||
2261 | struct btrfs_block_rsv *rsv); | ||
2262 | int btrfs_set_block_group_ro(struct btrfs_root *root, | 2381 | int btrfs_set_block_group_ro(struct btrfs_root *root, |
2263 | struct btrfs_block_group_cache *cache); | 2382 | struct btrfs_block_group_cache *cache); |
2264 | int btrfs_set_block_group_rw(struct btrfs_root *root, | 2383 | int btrfs_set_block_group_rw(struct btrfs_root *root, |
@@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) | |||
2379 | smp_mb(); | 2498 | smp_mb(); |
2380 | return fs_info->closing; | 2499 | return fs_info->closing; |
2381 | } | 2500 | } |
2501 | static inline void free_fs_info(struct btrfs_fs_info *fs_info) | ||
2502 | { | ||
2503 | kfree(fs_info->delayed_root); | ||
2504 | kfree(fs_info->extent_root); | ||
2505 | kfree(fs_info->tree_root); | ||
2506 | kfree(fs_info->chunk_root); | ||
2507 | kfree(fs_info->dev_root); | ||
2508 | kfree(fs_info->csum_root); | ||
2509 | kfree(fs_info->super_copy); | ||
2510 | kfree(fs_info->super_for_commit); | ||
2511 | kfree(fs_info); | ||
2512 | } | ||
2382 | 2513 | ||
2383 | /* root-item.c */ | 2514 | /* root-item.c */ |
2384 | int btrfs_find_root_ref(struct btrfs_root *tree_root, | 2515 | int btrfs_find_root_ref(struct btrfs_root *tree_root, |
@@ -2579,11 +2710,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, | |||
2579 | int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); | 2710 | int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); |
2580 | int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); | 2711 | int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); |
2581 | int btrfs_orphan_cleanup(struct btrfs_root *root); | 2712 | int btrfs_orphan_cleanup(struct btrfs_root *root); |
2582 | void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, | ||
2583 | struct btrfs_pending_snapshot *pending, | ||
2584 | u64 *bytes_to_reserve); | ||
2585 | void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, | ||
2586 | struct btrfs_pending_snapshot *pending); | ||
2587 | void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, | 2713 | void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, |
2588 | struct btrfs_root *root); | 2714 | struct btrfs_root *root); |
2589 | int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); | 2715 | int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); |
@@ -2697,4 +2823,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid); | |||
2697 | int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, | 2823 | int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, |
2698 | struct btrfs_scrub_progress *progress); | 2824 | struct btrfs_scrub_progress *progress); |
2699 | 2825 | ||
2826 | /* reada.c */ | ||
2827 | struct reada_control { | ||
2828 | struct btrfs_root *root; /* tree to prefetch */ | ||
2829 | struct btrfs_key key_start; | ||
2830 | struct btrfs_key key_end; /* exclusive */ | ||
2831 | atomic_t elems; | ||
2832 | struct kref refcnt; | ||
2833 | wait_queue_head_t wait; | ||
2834 | }; | ||
2835 | struct reada_control *btrfs_reada_add(struct btrfs_root *root, | ||
2836 | struct btrfs_key *start, struct btrfs_key *end); | ||
2837 | int btrfs_reada_wait(void *handle); | ||
2838 | void btrfs_reada_detach(void *handle); | ||
2839 | int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, | ||
2840 | u64 start, int err); | ||
2841 | |||
2700 | #endif | 2842 | #endif |
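[editor's note] The readahead interface declared above is consumed elsewhere in the series (e.g. by scrub); a minimal sketch of the intended call pattern, with the key range assumed and error handling elided:

	/* sketch: prefetch a whole key range of a tree, then wait for it */
	struct btrfs_key start = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key end = { .objectid = (u64)-1, .type = (u8)-1,
				 .offset = (u64)-1 };
	struct reada_control *rc;

	rc = btrfs_reada_add(root, &start, &end);
	if (!IS_ERR(rc))
		btrfs_reada_wait(rc);	/* or btrfs_reada_detach(rc) to fire and forget */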
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index ae4d9cd1096..5b163572e0c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c | |||
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, | |||
591 | return 0; | 591 | return 0; |
592 | 592 | ||
593 | src_rsv = trans->block_rsv; | 593 | src_rsv = trans->block_rsv; |
594 | dst_rsv = &root->fs_info->global_block_rsv; | 594 | dst_rsv = &root->fs_info->delayed_block_rsv; |
595 | 595 | ||
596 | num_bytes = btrfs_calc_trans_metadata_size(root, 1); | 596 | num_bytes = btrfs_calc_trans_metadata_size(root, 1); |
597 | ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); | 597 | ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); |
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, | |||
609 | if (!item->bytes_reserved) | 609 | if (!item->bytes_reserved) |
610 | return; | 610 | return; |
611 | 611 | ||
612 | rsv = &root->fs_info->global_block_rsv; | 612 | rsv = &root->fs_info->delayed_block_rsv; |
613 | btrfs_block_rsv_release(root, rsv, | 613 | btrfs_block_rsv_release(root, rsv, |
614 | item->bytes_reserved); | 614 | item->bytes_reserved); |
615 | } | 615 | } |
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, | |||
617 | static int btrfs_delayed_inode_reserve_metadata( | 617 | static int btrfs_delayed_inode_reserve_metadata( |
618 | struct btrfs_trans_handle *trans, | 618 | struct btrfs_trans_handle *trans, |
619 | struct btrfs_root *root, | 619 | struct btrfs_root *root, |
620 | struct inode *inode, | ||
620 | struct btrfs_delayed_node *node) | 621 | struct btrfs_delayed_node *node) |
621 | { | 622 | { |
622 | struct btrfs_block_rsv *src_rsv; | 623 | struct btrfs_block_rsv *src_rsv; |
623 | struct btrfs_block_rsv *dst_rsv; | 624 | struct btrfs_block_rsv *dst_rsv; |
624 | u64 num_bytes; | 625 | u64 num_bytes; |
625 | int ret; | 626 | int ret; |
626 | 627 | bool release = false; |
627 | if (!trans->bytes_reserved) | ||
628 | return 0; | ||
629 | 628 | ||
630 | src_rsv = trans->block_rsv; | 629 | src_rsv = trans->block_rsv; |
631 | dst_rsv = &root->fs_info->global_block_rsv; | 630 | dst_rsv = &root->fs_info->delayed_block_rsv; |
632 | 631 | ||
633 | num_bytes = btrfs_calc_trans_metadata_size(root, 1); | 632 | num_bytes = btrfs_calc_trans_metadata_size(root, 1); |
633 | |||
634 | /* | ||
635 | * btrfs_dirty_inode will update the inode under btrfs_join_transaction | ||
636 | * which, for speed, doesn't reserve space. This is a problem since we ||
637 | * still need to reserve space for this update, so try to reserve the | ||
638 | * space. | ||
639 | * | ||
640 | * Now if src_rsv == delalloc_block_rsv we'll let it just steal since | ||
641 | * we're accounted for. | ||
642 | */ | ||
643 | if (!trans->bytes_reserved && | ||
644 | src_rsv != &root->fs_info->delalloc_block_rsv) { | ||
645 | ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); | ||
646 | /* | ||
647 | * Since we're under a transaction reserve_metadata_bytes could | ||
648 | * try to commit the transaction which will make it return | ||
649 | * EAGAIN to make us stop the transaction we have, so return | ||
650 | * ENOSPC instead so that btrfs_dirty_inode knows what to do. | ||
651 | */ | ||
652 | if (ret == -EAGAIN) | ||
653 | ret = -ENOSPC; | ||
654 | if (!ret) | ||
655 | node->bytes_reserved = num_bytes; | ||
656 | return ret; | ||
657 | } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { | ||
658 | spin_lock(&BTRFS_I(inode)->lock); | ||
659 | if (BTRFS_I(inode)->delalloc_meta_reserved) { | ||
660 | BTRFS_I(inode)->delalloc_meta_reserved = 0; | ||
661 | spin_unlock(&BTRFS_I(inode)->lock); | ||
662 | release = true; | ||
663 | goto migrate; | ||
664 | } | ||
665 | spin_unlock(&BTRFS_I(inode)->lock); | ||
666 | |||
667 | /* Ok we didn't have space pre-reserved. This shouldn't happen | ||
668 | * too often but it can happen if we do delalloc to an existing | ||
669 | * inode which gets dirtied because of the time update, and then | ||
670 | * isn't touched again until after the transaction commits and | ||
671 | * then we try to write out the data. First try to be nice and | ||
672 | * reserve something strictly for us. If not, be a pain and try ||
673 | * to steal from the delalloc block rsv. | ||
674 | */ | ||
675 | ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); | ||
676 | if (!ret) | ||
677 | goto out; | ||
678 | |||
679 | ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); | ||
680 | if (!ret) | ||
681 | goto out; | ||
682 | |||
683 | /* | ||
684 | * Ok this is a problem, let's just steal from the global rsv | ||
685 | * since this really shouldn't happen that often. | ||
686 | */ | ||
687 | WARN_ON(1); | ||
688 | ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, | ||
689 | dst_rsv, num_bytes); | ||
690 | goto out; | ||
691 | } | ||
692 | |||
693 | migrate: | ||
634 | ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); | 694 | ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); |
695 | |||
696 | out: | ||
697 | /* | ||
698 | * Migrate only takes a reservation, it doesn't touch the size of the | ||
699 | * block_rsv. This is to simplify people who don't normally have things | ||
700 | * migrated from their block rsv. If they go to release their | ||
701 | * reservation, that will decrease the size as well, so if migrate | ||
702 | * reduced size we'd end up with a negative size. But for the | ||
703 | * delalloc_meta_reserved stuff we will only know to drop 1 reservation, | ||
704 | * but we could in fact do this reserve/migrate dance several times | ||
705 | * between the time we did the original reservation and we'd clean it | ||
706 | * up. So to take care of this, release the space for the meta | ||
707 | * reservation here. I think it may be time for a documentation page on | ||
708 | * how block rsvs. work. | ||
709 | */ | ||
635 | if (!ret) | 710 | if (!ret) |
636 | node->bytes_reserved = num_bytes; | 711 | node->bytes_reserved = num_bytes; |
637 | 712 | ||
713 | if (release) | ||
714 | btrfs_block_rsv_release(root, src_rsv, num_bytes); | ||
715 | |||
638 | return ret; | 716 | return ret; |
639 | } | 717 | } |
640 | 718 | ||
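[editor's note] The long comment above is easier to follow with the accounting made explicit: a block_rsv carries both a size (the target) and a reserved amount; migrate moves reserved bytes without touching size on either side, while release shrinks both. A toy model of just that invariant (the real struct btrfs_block_rsv carries more state):

	struct toy_rsv { u64 size; u64 reserved; };

	static void toy_migrate(struct toy_rsv *src, struct toy_rsv *dst, u64 n)
	{
		src->reserved -= n;	/* sizes untouched on both sides */
		dst->reserved += n;
	}

	static void toy_release(struct toy_rsv *rsv, u64 n)
	{
		rsv->size -= n;		/* release shrinks the target too, which is
					 * why migrate must not: a later release of
					 * migrated bytes would drive size negative */
		rsv->reserved -= n;
	}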
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, | |||
646 | if (!node->bytes_reserved) | 724 | if (!node->bytes_reserved) |
647 | return; | 725 | return; |
648 | 726 | ||
649 | rsv = &root->fs_info->global_block_rsv; | 727 | rsv = &root->fs_info->delayed_block_rsv; |
650 | btrfs_block_rsv_release(root, rsv, | 728 | btrfs_block_rsv_release(root, rsv, |
651 | node->bytes_reserved); | 729 | node->bytes_reserved); |
652 | node->bytes_reserved = 0; | 730 | node->bytes_reserved = 0; |
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, | |||
1026 | path->leave_spinning = 1; | 1104 | path->leave_spinning = 1; |
1027 | 1105 | ||
1028 | block_rsv = trans->block_rsv; | 1106 | block_rsv = trans->block_rsv; |
1029 | trans->block_rsv = &root->fs_info->global_block_rsv; | 1107 | trans->block_rsv = &root->fs_info->delayed_block_rsv; |
1030 | 1108 | ||
1031 | delayed_root = btrfs_get_delayed_root(root); | 1109 | delayed_root = btrfs_get_delayed_root(root); |
1032 | 1110 | ||
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, | |||
1069 | path->leave_spinning = 1; | 1147 | path->leave_spinning = 1; |
1070 | 1148 | ||
1071 | block_rsv = trans->block_rsv; | 1149 | block_rsv = trans->block_rsv; |
1072 | trans->block_rsv = &node->root->fs_info->global_block_rsv; | 1150 | trans->block_rsv = &node->root->fs_info->delayed_block_rsv; |
1073 | 1151 | ||
1074 | ret = btrfs_insert_delayed_items(trans, path, node->root, node); | 1152 | ret = btrfs_insert_delayed_items(trans, path, node->root, node); |
1075 | if (!ret) | 1153 | if (!ret) |
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) | |||
1149 | goto free_path; | 1227 | goto free_path; |
1150 | 1228 | ||
1151 | block_rsv = trans->block_rsv; | 1229 | block_rsv = trans->block_rsv; |
1152 | trans->block_rsv = &root->fs_info->global_block_rsv; | 1230 | trans->block_rsv = &root->fs_info->delayed_block_rsv; |
1153 | 1231 | ||
1154 | ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); | 1232 | ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); |
1155 | if (!ret) | 1233 | if (!ret) |
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, | |||
1685 | goto release_node; | 1763 | goto release_node; |
1686 | } | 1764 | } |
1687 | 1765 | ||
1688 | ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); | 1766 | ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, |
1689 | /* | 1767 | delayed_node); |
1690 | * we must reserve enough space when we start a new transaction, | 1768 | if (ret) |
1691 | * so reserving metadata failure is impossible | 1769 | goto release_node; |
1692 | */ | ||
1693 | BUG_ON(ret); | ||
1694 | 1770 | ||
1695 | fill_stack_inode_item(trans, &delayed_node->inode_item, inode); | 1771 | fill_stack_inode_item(trans, &delayed_node->inode_item, inode); |
1696 | delayed_node->inode_dirty = 1; | 1772 | delayed_node->inode_dirty = 1; |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 07ea91879a9..632f8f3cc9d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result) | |||
256 | static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, | 256 | static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, |
257 | int verify) | 257 | int verify) |
258 | { | 258 | { |
259 | u16 csum_size = | 259 | u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); |
260 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
261 | char *result = NULL; | 260 | char *result = NULL; |
262 | unsigned long len; | 261 | unsigned long len; |
263 | unsigned long cur_len; | 262 | unsigned long cur_len; |
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, | |||
367 | clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); | 366 | clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); |
368 | io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; | 367 | io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; |
369 | while (1) { | 368 | while (1) { |
370 | ret = read_extent_buffer_pages(io_tree, eb, start, 1, | 369 | ret = read_extent_buffer_pages(io_tree, eb, start, |
370 | WAIT_COMPLETE, | ||
371 | btree_get_extent, mirror_num); | 371 | btree_get_extent, mirror_num); |
372 | if (!ret && | 372 | if (!ret && |
373 | !verify_parent_transid(io_tree, eb, parent_transid)) | 373 | !verify_parent_transid(io_tree, eb, parent_transid)) |
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, | |||
608 | end = min_t(u64, eb->len, PAGE_CACHE_SIZE); | 608 | end = min_t(u64, eb->len, PAGE_CACHE_SIZE); |
609 | end = eb->start + end - 1; | 609 | end = eb->start + end - 1; |
610 | err: | 610 | err: |
611 | if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { | ||
612 | clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); | ||
613 | btree_readahead_hook(root, eb, eb->start, ret); | ||
614 | } | ||
615 | |||
611 | free_extent_buffer(eb); | 616 | free_extent_buffer(eb); |
612 | out: | 617 | out: |
613 | return ret; | 618 | return ret; |
614 | } | 619 | } |
615 | 620 | ||
621 | static int btree_io_failed_hook(struct bio *failed_bio, | ||
622 | struct page *page, u64 start, u64 end, | ||
623 | int mirror_num, struct extent_state *state) | ||
624 | { | ||
625 | struct extent_io_tree *tree; | ||
626 | unsigned long len; | ||
627 | struct extent_buffer *eb; | ||
628 | struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; | ||
629 | |||
630 | tree = &BTRFS_I(page->mapping->host)->io_tree; | ||
631 | if (page->private == EXTENT_PAGE_PRIVATE) | ||
632 | goto out; | ||
633 | if (!page->private) | ||
634 | goto out; | ||
635 | |||
636 | len = page->private >> 2; | ||
637 | WARN_ON(len == 0); | ||
638 | |||
639 | eb = alloc_extent_buffer(tree, start, len, page); | ||
640 | if (eb == NULL) | ||
641 | goto out; | ||
642 | |||
643 | if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { | ||
644 | clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); | ||
645 | btree_readahead_hook(root, eb, eb->start, -EIO); | ||
646 | } | ||
647 | free_extent_buffer(eb); | ||
648 | |||
649 | out: | ||
650 | return -EIO; /* we fixed nothing */ | ||
651 | } | ||
652 | |||
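[editor's note] This failure hook mirrors the success path added to btree_readpage_end_io_hook() above: whichever end_io path runs performs the same clear-and-notify step, so the readahead machinery hears about every tagged buffer exactly once. The handshake reduced to its core (submit_read() is a stand-in for the real submission):

	/* submit side, see reada_tree_block_flagged() below */
	set_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
	submit_read(eb);

	/* completion side, shared by the success and failure hooks */
	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
		btree_readahead_hook(root, eb, eb->start, err);	/* err == 0 on success */
	}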
616 | static void end_workqueue_bio(struct bio *bio, int err) | 653 | static void end_workqueue_bio(struct bio *bio, int err) |
617 | { | 654 | { |
618 | struct end_io_wq *end_io_wq = bio->bi_private; | 655 | struct end_io_wq *end_io_wq = bio->bi_private; |
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page) | |||
908 | { | 945 | { |
909 | struct extent_io_tree *tree; | 946 | struct extent_io_tree *tree; |
910 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 947 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
911 | return extent_read_full_page(tree, page, btree_get_extent); | 948 | return extent_read_full_page(tree, page, btree_get_extent, 0); |
912 | } | 949 | } |
913 | 950 | ||
914 | static int btree_releasepage(struct page *page, gfp_t gfp_flags) | 951 | static int btree_releasepage(struct page *page, gfp_t gfp_flags) |
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, | |||
974 | if (!buf) | 1011 | if (!buf) |
975 | return 0; | 1012 | return 0; |
976 | read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, | 1013 | read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, |
977 | buf, 0, 0, btree_get_extent, 0); | 1014 | buf, 0, WAIT_NONE, btree_get_extent, 0); |
978 | free_extent_buffer(buf); | 1015 | free_extent_buffer(buf); |
979 | return ret; | 1016 | return ret; |
980 | } | 1017 | } |
981 | 1018 | ||
1019 | int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, | ||
1020 | int mirror_num, struct extent_buffer **eb) | ||
1021 | { | ||
1022 | struct extent_buffer *buf = NULL; | ||
1023 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
1024 | struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; | ||
1025 | int ret; | ||
1026 | |||
1027 | buf = btrfs_find_create_tree_block(root, bytenr, blocksize); | ||
1028 | if (!buf) | ||
1029 | return 0; | ||
1030 | |||
1031 | set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); | ||
1032 | |||
1033 | ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK, | ||
1034 | btree_get_extent, mirror_num); | ||
1035 | if (ret) { | ||
1036 | free_extent_buffer(buf); | ||
1037 | return ret; | ||
1038 | } | ||
1039 | |||
1040 | if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { | ||
1041 | free_extent_buffer(buf); | ||
1042 | return -EIO; | ||
1043 | } else if (extent_buffer_uptodate(io_tree, buf, NULL)) { | ||
1044 | *eb = buf; | ||
1045 | } else { | ||
1046 | free_extent_buffer(buf); | ||
1047 | } | ||
1048 | return 0; | ||
1049 | } | ||
1050 | |||
982 | struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, | 1051 | struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, |
983 | u64 bytenr, u32 blocksize) | 1052 | u64 bytenr, u32 blocksize) |
984 | { | 1053 | { |
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root, | |||
1135 | 1204 | ||
1136 | generation = btrfs_root_generation(&root->root_item); | 1205 | generation = btrfs_root_generation(&root->root_item); |
1137 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); | 1206 | blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); |
1207 | root->commit_root = NULL; | ||
1138 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), | 1208 | root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), |
1139 | blocksize, generation); | 1209 | blocksize, generation); |
1140 | if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { | 1210 | if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { |
1141 | free_extent_buffer(root->node); | 1211 | free_extent_buffer(root->node); |
1212 | root->node = NULL; | ||
1142 | return -EIO; | 1213 | return -EIO; |
1143 | } | 1214 | } |
1144 | root->commit_root = btrfs_root_node(root); | 1215 | root->commit_root = btrfs_root_node(root); |
@@ -1577,6 +1648,235 @@ sleep: | |||
1577 | return 0; | 1648 | return 0; |
1578 | } | 1649 | } |
1579 | 1650 | ||
1651 | /* | ||
1652 | * this will find the highest generation in the array of | ||
1653 | * root backups. The index of the newest entry is returned, ||
1654 | * or -1 if we can't find anything. | ||
1655 | * | ||
1656 | * We check to make sure the array is valid by comparing the | ||
1657 | * generation of the latest root in the array with the generation | ||
1658 | * in the super block. If they don't match, we pitch it. ||
1659 | */ | ||
1660 | static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen) | ||
1661 | { | ||
1662 | u64 cur; | ||
1663 | int newest_index = -1; | ||
1664 | struct btrfs_root_backup *root_backup; | ||
1665 | int i; | ||
1666 | |||
1667 | for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { | ||
1668 | root_backup = info->super_copy->super_roots + i; | ||
1669 | cur = btrfs_backup_tree_root_gen(root_backup); | ||
1670 | if (cur == newest_gen) | ||
1671 | newest_index = i; | ||
1672 | } | ||
1673 | |||
1674 | /* check to see if we actually wrapped around */ | ||
1675 | if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) { | ||
1676 | root_backup = info->super_copy->super_roots; | ||
1677 | cur = btrfs_backup_tree_root_gen(root_backup); | ||
1678 | if (cur == newest_gen) | ||
1679 | newest_index = 0; | ||
1680 | } | ||
1681 | return newest_index; | ||
1682 | } | ||
1683 | |||
1684 | |||
1685 | /* | ||
1686 | * find the oldest backup so we know where to store new entries | ||
1687 | * in the backup array. This will set the backup_root_index | ||
1688 | * field in the fs_info struct | ||
1689 | */ | ||
1690 | static void find_oldest_super_backup(struct btrfs_fs_info *info, | ||
1691 | u64 newest_gen) | ||
1692 | { | ||
1693 | int newest_index = -1; | ||
1694 | |||
1695 | newest_index = find_newest_super_backup(info, newest_gen); | ||
1696 | /* if there was garbage in there, just move along */ | ||
1697 | if (newest_index == -1) { | ||
1698 | info->backup_root_index = 0; | ||
1699 | } else { | ||
1700 | info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS; | ||
1701 | } | ||
1702 | } | ||
1703 | |||
1704 | /* | ||
1705 | * copy all the root pointers into the super backup array. | ||
1706 | * this will bump the backup pointer by one when it is | ||
1707 | * done | ||
1708 | */ | ||
1709 | static void backup_super_roots(struct btrfs_fs_info *info) | ||
1710 | { | ||
1711 | int next_backup; | ||
1712 | struct btrfs_root_backup *root_backup; | ||
1713 | int last_backup; | ||
1714 | |||
1715 | next_backup = info->backup_root_index; | ||
1716 | last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) % | ||
1717 | BTRFS_NUM_BACKUP_ROOTS; | ||
1718 | |||
1719 | /* | ||
1720 | * just overwrite the last backup if we're at the same generation | ||
1721 | * this happens only at umount | ||
1722 | */ | ||
1723 | root_backup = info->super_for_commit->super_roots + last_backup; | ||
1724 | if (btrfs_backup_tree_root_gen(root_backup) == | ||
1725 | btrfs_header_generation(info->tree_root->node)) | ||
1726 | next_backup = last_backup; | ||
1727 | |||
1728 | root_backup = info->super_for_commit->super_roots + next_backup; | ||
1729 | |||
1730 | /* | ||
1731 | * make sure all of our padding and empty slots get zero filled | ||
1732 | * regardless of which ones we use today | ||
1733 | */ | ||
1734 | memset(root_backup, 0, sizeof(*root_backup)); | ||
1735 | |||
1736 | info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS; | ||
1737 | |||
1738 | btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start); | ||
1739 | btrfs_set_backup_tree_root_gen(root_backup, | ||
1740 | btrfs_header_generation(info->tree_root->node)); | ||
1741 | |||
1742 | btrfs_set_backup_tree_root_level(root_backup, | ||
1743 | btrfs_header_level(info->tree_root->node)); | ||
1744 | |||
1745 | btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start); | ||
1746 | btrfs_set_backup_chunk_root_gen(root_backup, | ||
1747 | btrfs_header_generation(info->chunk_root->node)); | ||
1748 | btrfs_set_backup_chunk_root_level(root_backup, | ||
1749 | btrfs_header_level(info->chunk_root->node)); | ||
1750 | |||
1751 | btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start); | ||
1752 | btrfs_set_backup_extent_root_gen(root_backup, | ||
1753 | btrfs_header_generation(info->extent_root->node)); | ||
1754 | btrfs_set_backup_extent_root_level(root_backup, | ||
1755 | btrfs_header_level(info->extent_root->node)); | ||
1756 | |||
1757 | /* | ||
1758 | * we might commit during log recovery, which happens before we set | ||
1759 | * the fs_root. Make sure it is valid before we fill it in. | ||
1760 | */ | ||
1761 | if (info->fs_root && info->fs_root->node) { | ||
1762 | btrfs_set_backup_fs_root(root_backup, | ||
1763 | info->fs_root->node->start); | ||
1764 | btrfs_set_backup_fs_root_gen(root_backup, | ||
1765 | btrfs_header_generation(info->fs_root->node)); | ||
1766 | btrfs_set_backup_fs_root_level(root_backup, | ||
1767 | btrfs_header_level(info->fs_root->node)); | ||
1768 | } | ||
1769 | |||
1770 | btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start); | ||
1771 | btrfs_set_backup_dev_root_gen(root_backup, | ||
1772 | btrfs_header_generation(info->dev_root->node)); | ||
1773 | btrfs_set_backup_dev_root_level(root_backup, | ||
1774 | btrfs_header_level(info->dev_root->node)); | ||
1775 | |||
1776 | btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start); | ||
1777 | btrfs_set_backup_csum_root_gen(root_backup, | ||
1778 | btrfs_header_generation(info->csum_root->node)); | ||
1779 | btrfs_set_backup_csum_root_level(root_backup, | ||
1780 | btrfs_header_level(info->csum_root->node)); | ||
1781 | |||
1782 | btrfs_set_backup_total_bytes(root_backup, | ||
1783 | btrfs_super_total_bytes(info->super_copy)); | ||
1784 | btrfs_set_backup_bytes_used(root_backup, | ||
1785 | btrfs_super_bytes_used(info->super_copy)); | ||
1786 | btrfs_set_backup_num_devices(root_backup, | ||
1787 | btrfs_super_num_devices(info->super_copy)); | ||
1788 | |||
1789 | /* | ||
1790 | * if we don't copy this out to the super_copy, it won't get remembered | ||
1791 | * for the next commit | ||
1792 | */ | ||
1793 | memcpy(&info->super_copy->super_roots, | ||
1794 | &info->super_for_commit->super_roots, | ||
1795 | sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS); | ||
1796 | } | ||
1797 | |||
1798 | /* | ||
1799 | * this copies info out of the root backup array and back into | ||
1800 | * the in-memory super block. It is meant to help iterate through | ||
1801 | * the array, so you send it the number of backups you've already | ||
1802 | * tried and the last backup index you used. | ||
1803 | * | ||
1804 | * this returns -1 when it has tried all the backups | ||
1805 | */ | ||
1806 | static noinline int next_root_backup(struct btrfs_fs_info *info, | ||
1807 | struct btrfs_super_block *super, | ||
1808 | int *num_backups_tried, int *backup_index) | ||
1809 | { | ||
1810 | struct btrfs_root_backup *root_backup; | ||
1811 | int newest = *backup_index; | ||
1812 | |||
1813 | if (*num_backups_tried == 0) { | ||
1814 | u64 gen = btrfs_super_generation(super); | ||
1815 | |||
1816 | newest = find_newest_super_backup(info, gen); | ||
1817 | if (newest == -1) | ||
1818 | return -1; | ||
1819 | |||
1820 | *backup_index = newest; | ||
1821 | *num_backups_tried = 1; | ||
1822 | } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) { | ||
1823 | /* we've tried all the backups, all done */ | ||
1824 | return -1; | ||
1825 | } else { | ||
1826 | /* jump to the next oldest backup */ | ||
1827 | newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) % | ||
1828 | BTRFS_NUM_BACKUP_ROOTS; | ||
1829 | *backup_index = newest; | ||
1830 | *num_backups_tried += 1; | ||
1831 | } | ||
1832 | root_backup = super->super_roots + newest; | ||
1833 | |||
1834 | btrfs_set_super_generation(super, | ||
1835 | btrfs_backup_tree_root_gen(root_backup)); | ||
1836 | btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup)); | ||
1837 | btrfs_set_super_root_level(super, | ||
1838 | btrfs_backup_tree_root_level(root_backup)); | ||
1839 | btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup)); | ||
1840 | |||
1841 | /* | ||
1842 | * fixme: the total bytes and num_devices need to match or we ||
1843 | * need a fsck | ||
1844 | */ | ||
1845 | btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup)); | ||
1846 | btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup)); | ||
1847 | return 0; | ||
1848 | } | ||
1849 | |||
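[editor's note] All three helpers treat the backup array as a ring of BTRFS_NUM_BACKUP_ROOTS slots (4 in this series; treat that value as an assumption here), so the only arithmetic involved is modular stepping. Distilled:

	/* ring stepping used by the backup-root helpers */
	static int ring_next(int idx, int n)
	{
		return (idx + 1) % n;		/* where backup_super_roots() writes */
	}

	static int ring_prev(int idx, int n)
	{
		return (idx + n - 1) % n;	/* next-oldest slot next_root_backup() tries */
	}

With n == 4, a recovery that starts at newest index 2 visits slots 2, 1, 0, 3 and then gives up.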
1850 | /* helper to cleanup tree roots */ | ||
1851 | static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) | ||
1852 | { | ||
1853 | free_extent_buffer(info->tree_root->node); | ||
1854 | free_extent_buffer(info->tree_root->commit_root); | ||
1855 | free_extent_buffer(info->dev_root->node); | ||
1856 | free_extent_buffer(info->dev_root->commit_root); | ||
1857 | free_extent_buffer(info->extent_root->node); | ||
1858 | free_extent_buffer(info->extent_root->commit_root); | ||
1859 | free_extent_buffer(info->csum_root->node); | ||
1860 | free_extent_buffer(info->csum_root->commit_root); | ||
1861 | |||
1862 | info->tree_root->node = NULL; | ||
1863 | info->tree_root->commit_root = NULL; | ||
1864 | info->dev_root->node = NULL; | ||
1865 | info->dev_root->commit_root = NULL; | ||
1866 | info->extent_root->node = NULL; | ||
1867 | info->extent_root->commit_root = NULL; | ||
1868 | info->csum_root->node = NULL; | ||
1869 | info->csum_root->commit_root = NULL; | ||
1870 | |||
1871 | if (chunk_root) { | ||
1872 | free_extent_buffer(info->chunk_root->node); | ||
1873 | free_extent_buffer(info->chunk_root->commit_root); | ||
1874 | info->chunk_root->node = NULL; | ||
1875 | info->chunk_root->commit_root = NULL; | ||
1876 | } | ||
1877 | } | ||
1878 | |||
1879 | |||
1580 | struct btrfs_root *open_ctree(struct super_block *sb, | 1880 | struct btrfs_root *open_ctree(struct super_block *sb, |
1581 | struct btrfs_fs_devices *fs_devices, | 1881 | struct btrfs_fs_devices *fs_devices, |
1582 | char *options) | 1882 | char *options) |
@@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1590 | u64 features; | 1890 | u64 features; |
1591 | struct btrfs_key location; | 1891 | struct btrfs_key location; |
1592 | struct buffer_head *bh; | 1892 | struct buffer_head *bh; |
1593 | struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), | 1893 | struct btrfs_super_block *disk_super; |
1594 | GFP_NOFS); | ||
1595 | struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), | ||
1596 | GFP_NOFS); | ||
1597 | struct btrfs_root *tree_root = btrfs_sb(sb); | 1894 | struct btrfs_root *tree_root = btrfs_sb(sb); |
1598 | struct btrfs_fs_info *fs_info = NULL; | 1895 | struct btrfs_fs_info *fs_info = tree_root->fs_info; |
1599 | struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), | 1896 | struct btrfs_root *extent_root; |
1600 | GFP_NOFS); | 1897 | struct btrfs_root *csum_root; |
1601 | struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), | 1898 | struct btrfs_root *chunk_root; |
1602 | GFP_NOFS); | 1899 | struct btrfs_root *dev_root; |
1603 | struct btrfs_root *log_tree_root; | 1900 | struct btrfs_root *log_tree_root; |
1604 | |||
1605 | int ret; | 1901 | int ret; |
1606 | int err = -EINVAL; | 1902 | int err = -EINVAL; |
1607 | 1903 | int num_backups_tried = 0; | |
1608 | struct btrfs_super_block *disk_super; | 1904 | int backup_index = 0; |
1609 | 1905 | ||
1610 | if (!extent_root || !tree_root || !tree_root->fs_info || | 1906 | extent_root = fs_info->extent_root = |
1611 | !chunk_root || !dev_root || !csum_root) { | 1907 | kzalloc(sizeof(struct btrfs_root), GFP_NOFS); |
1908 | csum_root = fs_info->csum_root = | ||
1909 | kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | ||
1910 | chunk_root = fs_info->chunk_root = | ||
1911 | kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | ||
1912 | dev_root = fs_info->dev_root = | ||
1913 | kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | ||
1914 | |||
1915 | if (!extent_root || !csum_root || !chunk_root || !dev_root) { | ||
1612 | err = -ENOMEM; | 1916 | err = -ENOMEM; |
1613 | goto fail; | 1917 | goto fail; |
1614 | } | 1918 | } |
1615 | fs_info = tree_root->fs_info; | ||
1616 | 1919 | ||
1617 | ret = init_srcu_struct(&fs_info->subvol_srcu); | 1920 | ret = init_srcu_struct(&fs_info->subvol_srcu); |
1618 | if (ret) { | 1921 | if (ret) { |
@@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1648 | spin_lock_init(&fs_info->fs_roots_radix_lock); | 1951 | spin_lock_init(&fs_info->fs_roots_radix_lock); |
1649 | spin_lock_init(&fs_info->delayed_iput_lock); | 1952 | spin_lock_init(&fs_info->delayed_iput_lock); |
1650 | spin_lock_init(&fs_info->defrag_inodes_lock); | 1953 | spin_lock_init(&fs_info->defrag_inodes_lock); |
1954 | spin_lock_init(&fs_info->free_chunk_lock); | ||
1651 | mutex_init(&fs_info->reloc_mutex); | 1955 | mutex_init(&fs_info->reloc_mutex); |
1652 | 1956 | ||
1653 | init_completion(&fs_info->kobj_unregister); | 1957 | init_completion(&fs_info->kobj_unregister); |
1654 | fs_info->tree_root = tree_root; | ||
1655 | fs_info->extent_root = extent_root; | ||
1656 | fs_info->csum_root = csum_root; | ||
1657 | fs_info->chunk_root = chunk_root; | ||
1658 | fs_info->dev_root = dev_root; | ||
1659 | fs_info->fs_devices = fs_devices; | ||
1660 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); | 1958 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); |
1661 | INIT_LIST_HEAD(&fs_info->space_info); | 1959 | INIT_LIST_HEAD(&fs_info->space_info); |
1662 | btrfs_mapping_init(&fs_info->mapping_tree); | 1960 | btrfs_mapping_init(&fs_info->mapping_tree); |
@@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1665 | btrfs_init_block_rsv(&fs_info->trans_block_rsv); | 1963 | btrfs_init_block_rsv(&fs_info->trans_block_rsv); |
1666 | btrfs_init_block_rsv(&fs_info->chunk_block_rsv); | 1964 | btrfs_init_block_rsv(&fs_info->chunk_block_rsv); |
1667 | btrfs_init_block_rsv(&fs_info->empty_block_rsv); | 1965 | btrfs_init_block_rsv(&fs_info->empty_block_rsv); |
1668 | INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); | 1966 | btrfs_init_block_rsv(&fs_info->delayed_block_rsv); |
1669 | mutex_init(&fs_info->durable_block_rsv_mutex); | ||
1670 | atomic_set(&fs_info->nr_async_submits, 0); | 1967 | atomic_set(&fs_info->nr_async_submits, 0); |
1671 | atomic_set(&fs_info->async_delalloc_pages, 0); | 1968 | atomic_set(&fs_info->async_delalloc_pages, 0); |
1672 | atomic_set(&fs_info->async_submit_draining, 0); | 1969 | atomic_set(&fs_info->async_submit_draining, 0); |
@@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1677 | fs_info->metadata_ratio = 0; | 1974 | fs_info->metadata_ratio = 0; |
1678 | fs_info->defrag_inodes = RB_ROOT; | 1975 | fs_info->defrag_inodes = RB_ROOT; |
1679 | fs_info->trans_no_join = 0; | 1976 | fs_info->trans_no_join = 0; |
1977 | fs_info->free_chunk_space = 0; | ||
1978 | |||
1979 | /* readahead state */ | ||
1980 | INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); | ||
1981 | spin_lock_init(&fs_info->reada_lock); | ||
1680 | 1982 | ||
1681 | fs_info->thread_pool_size = min_t(unsigned long, | 1983 | fs_info->thread_pool_size = min_t(unsigned long, |
1682 | num_online_cpus() + 2, 8); | 1984 | num_online_cpus() + 2, 8); |
@@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1766 | goto fail_alloc; | 2068 | goto fail_alloc; |
1767 | } | 2069 | } |
1768 | 2070 | ||
1769 | memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); | 2071 | memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy)); |
1770 | memcpy(&fs_info->super_for_commit, &fs_info->super_copy, | 2072 | memcpy(fs_info->super_for_commit, fs_info->super_copy, |
1771 | sizeof(fs_info->super_for_commit)); | 2073 | sizeof(*fs_info->super_for_commit)); |
1772 | brelse(bh); | 2074 | brelse(bh); |
1773 | 2075 | ||
1774 | memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); | 2076 | memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE); |
1775 | 2077 | ||
1776 | disk_super = &fs_info->super_copy; | 2078 | disk_super = fs_info->super_copy; |
1777 | if (!btrfs_super_root(disk_super)) | 2079 | if (!btrfs_super_root(disk_super)) |
1778 | goto fail_alloc; | 2080 | goto fail_alloc; |
1779 | 2081 | ||
@@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1783 | btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); | 2085 | btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); |
1784 | 2086 | ||
1785 | /* | 2087 | /* |
2088 | * run through our array of backup supers and setup | ||
2089 | * our ring pointer to the oldest one | ||
2090 | */ | ||
2091 | generation = btrfs_super_generation(disk_super); | ||
2092 | find_oldest_super_backup(fs_info, generation); | ||
2093 | |||
2094 | /* | ||
1786 | * In the long term, we'll store the compression type in the super | 2095 | * In the long term, we'll store the compression type in the super |
1787 | * block, and it'll be used for per file compression control. | 2096 | * block, and it'll be used for per file compression control. |
1788 | */ | 2097 | */ |
@@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1870 | btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", | 2179 | btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", |
1871 | fs_info->thread_pool_size, | 2180 | fs_info->thread_pool_size, |
1872 | &fs_info->generic_worker); | 2181 | &fs_info->generic_worker); |
2182 | btrfs_init_workers(&fs_info->readahead_workers, "readahead", | ||
2183 | fs_info->thread_pool_size, | ||
2184 | &fs_info->generic_worker); | ||
1873 | 2185 | ||
1874 | /* | 2186 | /* |
1875 | * endios are largely parallel and should have a very | 2187 | * endios are largely parallel and should have a very |
@@ -1880,6 +2192,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1880 | 2192 | ||
1881 | fs_info->endio_write_workers.idle_thresh = 2; | 2193 | fs_info->endio_write_workers.idle_thresh = 2; |
1882 | fs_info->endio_meta_write_workers.idle_thresh = 2; | 2194 | fs_info->endio_meta_write_workers.idle_thresh = 2; |
2195 | fs_info->readahead_workers.idle_thresh = 2; | ||
1883 | 2196 | ||
1884 | btrfs_start_workers(&fs_info->workers, 1); | 2197 | btrfs_start_workers(&fs_info->workers, 1); |
1885 | btrfs_start_workers(&fs_info->generic_worker, 1); | 2198 | btrfs_start_workers(&fs_info->generic_worker, 1); |
@@ -1893,6 +2206,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1893 | btrfs_start_workers(&fs_info->endio_freespace_worker, 1); | 2206 | btrfs_start_workers(&fs_info->endio_freespace_worker, 1); |
1894 | btrfs_start_workers(&fs_info->delayed_workers, 1); | 2207 | btrfs_start_workers(&fs_info->delayed_workers, 1); |
1895 | btrfs_start_workers(&fs_info->caching_workers, 1); | 2208 | btrfs_start_workers(&fs_info->caching_workers, 1); |
2209 | btrfs_start_workers(&fs_info->readahead_workers, 1); | ||
1896 | 2210 | ||
1897 | fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); | 2211 | fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); |
1898 | fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, | 2212 | fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, |
@@ -1939,7 +2253,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1939 | if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { | 2253 | if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { |
1940 | printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", | 2254 | printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", |
1941 | sb->s_id); | 2255 | sb->s_id); |
1942 | goto fail_chunk_root; | 2256 | goto fail_tree_roots; |
1943 | } | 2257 | } |
1944 | btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); | 2258 | btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); |
1945 | chunk_root->commit_root = btrfs_root_node(chunk_root); | 2259 | chunk_root->commit_root = btrfs_root_node(chunk_root); |
@@ -1954,11 +2268,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1954 | if (ret) { | 2268 | if (ret) { |
1955 | printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", | 2269 | printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", |
1956 | sb->s_id); | 2270 | sb->s_id); |
1957 | goto fail_chunk_root; | 2271 | goto fail_tree_roots; |
1958 | } | 2272 | } |
1959 | 2273 | ||
1960 | btrfs_close_extra_devices(fs_devices); | 2274 | btrfs_close_extra_devices(fs_devices); |
1961 | 2275 | ||
2276 | retry_root_backup: | ||
1962 | blocksize = btrfs_level_size(tree_root, | 2277 | blocksize = btrfs_level_size(tree_root, |
1963 | btrfs_super_root_level(disk_super)); | 2278 | btrfs_super_root_level(disk_super)); |
1964 | generation = btrfs_super_generation(disk_super); | 2279 | generation = btrfs_super_generation(disk_super); |
@@ -1966,32 +2281,33 @@ struct btrfs_root *open_ctree(struct super_block *sb, | |||
1966 | tree_root->node = read_tree_block(tree_root, | 2281 | tree_root->node = read_tree_block(tree_root, |
1967 | btrfs_super_root(disk_super), | 2282 | btrfs_super_root(disk_super), |
1968 | blocksize, generation); | 2283 | blocksize, generation); |
1969 | if (!tree_root->node) | 2284 | if (!tree_root->node || |
1970 | goto fail_chunk_root; | 2285 | !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { |
1971 | if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { | ||
1972 | printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", | 2286 | printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", |
1973 | sb->s_id); | 2287 | sb->s_id); |
1974 | goto fail_tree_root; | 2288 | |
2289 | goto recovery_tree_root; | ||
1975 | } | 2290 | } |
2291 | |||
1976 | btrfs_set_root_node(&tree_root->root_item, tree_root->node); | 2292 | btrfs_set_root_node(&tree_root->root_item, tree_root->node); |
1977 | tree_root->commit_root = btrfs_root_node(tree_root); | 2293 | tree_root->commit_root = btrfs_root_node(tree_root); |
1978 | 2294 | ||
1979 | ret = find_and_setup_root(tree_root, fs_info, | 2295 | ret = find_and_setup_root(tree_root, fs_info, |
1980 | BTRFS_EXTENT_TREE_OBJECTID, extent_root); | 2296 | BTRFS_EXTENT_TREE_OBJECTID, extent_root); |
1981 | if (ret) | 2297 | if (ret) |
1982 | goto fail_tree_root; | 2298 | goto recovery_tree_root; |
1983 | extent_root->track_dirty = 1; | 2299 | extent_root->track_dirty = 1; |
1984 | 2300 | ||
1985 | ret = find_and_setup_root(tree_root, fs_info, | 2301 | ret = find_and_setup_root(tree_root, fs_info, |
1986 | BTRFS_DEV_TREE_OBJECTID, dev_root); | 2302 | BTRFS_DEV_TREE_OBJECTID, dev_root); |
1987 | if (ret) | 2303 | if (ret) |
1988 | goto fail_extent_root; | 2304 | goto recovery_tree_root; |
1989 | dev_root->track_dirty = 1; | 2305 | dev_root->track_dirty = 1; |
1990 | 2306 | ||
1991 | ret = find_and_setup_root(tree_root, fs_info, | 2307 | ret = find_and_setup_root(tree_root, fs_info, |
1992 | BTRFS_CSUM_TREE_OBJECTID, csum_root); | 2308 | BTRFS_CSUM_TREE_OBJECTID, csum_root); |
1993 | if (ret) | 2309 | if (ret) |
1994 | goto fail_dev_root; | 2310 | goto recovery_tree_root; |
1995 | 2311 | ||
1996 | csum_root->track_dirty = 1; | 2312 | csum_root->track_dirty = 1; |
1997 | 2313 | ||
@@ -2124,22 +2440,13 @@ fail_cleaner: | |||
2124 | 2440 | ||
2125 | fail_block_groups: | 2441 | fail_block_groups: |
2126 | btrfs_free_block_groups(fs_info); | 2442 | btrfs_free_block_groups(fs_info); |
2127 | free_extent_buffer(csum_root->node); | 2443 | |
2128 | free_extent_buffer(csum_root->commit_root); | 2444 | fail_tree_roots: |
2129 | fail_dev_root: | 2445 | free_root_pointers(fs_info, 1); |
2130 | free_extent_buffer(dev_root->node); | 2446 | |
2131 | free_extent_buffer(dev_root->commit_root); | ||
2132 | fail_extent_root: | ||
2133 | free_extent_buffer(extent_root->node); | ||
2134 | free_extent_buffer(extent_root->commit_root); | ||
2135 | fail_tree_root: | ||
2136 | free_extent_buffer(tree_root->node); | ||
2137 | free_extent_buffer(tree_root->commit_root); | ||
2138 | fail_chunk_root: | ||
2139 | free_extent_buffer(chunk_root->node); | ||
2140 | free_extent_buffer(chunk_root->commit_root); | ||
2141 | fail_sb_buffer: | 2447 | fail_sb_buffer: |
2142 | btrfs_stop_workers(&fs_info->generic_worker); | 2448 | btrfs_stop_workers(&fs_info->generic_worker); |
2449 | btrfs_stop_workers(&fs_info->readahead_workers); | ||
2143 | btrfs_stop_workers(&fs_info->fixup_workers); | 2450 | btrfs_stop_workers(&fs_info->fixup_workers); |
2144 | btrfs_stop_workers(&fs_info->delalloc_workers); | 2451 | btrfs_stop_workers(&fs_info->delalloc_workers); |
2145 | btrfs_stop_workers(&fs_info->workers); | 2452 | btrfs_stop_workers(&fs_info->workers); |
@@ -2152,25 +2459,37 @@ fail_sb_buffer: | |||
2152 | btrfs_stop_workers(&fs_info->delayed_workers); | 2459 | btrfs_stop_workers(&fs_info->delayed_workers); |
2153 | btrfs_stop_workers(&fs_info->caching_workers); | 2460 | btrfs_stop_workers(&fs_info->caching_workers); |
2154 | fail_alloc: | 2461 | fail_alloc: |
2155 | kfree(fs_info->delayed_root); | ||
2156 | fail_iput: | 2462 | fail_iput: |
2463 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | ||
2464 | |||
2157 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | 2465 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); |
2158 | iput(fs_info->btree_inode); | 2466 | iput(fs_info->btree_inode); |
2159 | |||
2160 | btrfs_close_devices(fs_info->fs_devices); | ||
2161 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | ||
2162 | fail_bdi: | 2467 | fail_bdi: |
2163 | bdi_destroy(&fs_info->bdi); | 2468 | bdi_destroy(&fs_info->bdi); |
2164 | fail_srcu: | 2469 | fail_srcu: |
2165 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 2470 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
2166 | fail: | 2471 | fail: |
2167 | kfree(extent_root); | 2472 | btrfs_close_devices(fs_info->fs_devices); |
2168 | kfree(tree_root); | 2473 | free_fs_info(fs_info); |
2169 | kfree(fs_info); | ||
2170 | kfree(chunk_root); | ||
2171 | kfree(dev_root); | ||
2172 | kfree(csum_root); | ||
2173 | return ERR_PTR(err); | 2474 | return ERR_PTR(err); |
2475 | |||
2476 | recovery_tree_root: | ||
2477 | if (!btrfs_test_opt(tree_root, RECOVERY)) | ||
2478 | goto fail_tree_roots; | ||
2479 | |||
2480 | free_root_pointers(fs_info, 0); | ||
2481 | |||
2482 | /* don't use the log in recovery mode, it won't be valid */ | ||
2483 | btrfs_set_super_log_root(disk_super, 0); | ||
2484 | |||
2485 | /* we can't trust the free space cache either */ | ||
2486 | btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); | ||
2487 | |||
2488 | ret = next_root_backup(fs_info, fs_info->super_copy, | ||
2489 | &num_backups_tried, &backup_index); | ||
2490 | if (ret == -1) | ||
2491 | goto fail_block_groups; | ||
2492 | goto retry_root_backup; | ||
2174 | } | 2493 | } |
2175 | 2494 | ||
2176 | static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) | 2495 | static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) |
@@ -2254,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2254 | int errors = 0; | 2573 | int errors = 0; |
2255 | u32 crc; | 2574 | u32 crc; |
2256 | u64 bytenr; | 2575 | u64 bytenr; |
2257 | int last_barrier = 0; | ||
2258 | 2576 | ||
2259 | if (max_mirrors == 0) | 2577 | if (max_mirrors == 0) |
2260 | max_mirrors = BTRFS_SUPER_MIRROR_MAX; | 2578 | max_mirrors = BTRFS_SUPER_MIRROR_MAX; |
2261 | 2579 | ||
2262 | /* make sure only the last submit_bh does a barrier */ | ||
2263 | if (do_barriers) { | ||
2264 | for (i = 0; i < max_mirrors; i++) { | ||
2265 | bytenr = btrfs_sb_offset(i); | ||
2266 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= | ||
2267 | device->total_bytes) | ||
2268 | break; | ||
2269 | last_barrier = i; | ||
2270 | } | ||
2271 | } | ||
2272 | |||
2273 | for (i = 0; i < max_mirrors; i++) { | 2580 | for (i = 0; i < max_mirrors; i++) { |
2274 | bytenr = btrfs_sb_offset(i); | 2581 | bytenr = btrfs_sb_offset(i); |
2275 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) | 2582 | if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) |
@@ -2315,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device, | |||
2315 | bh->b_end_io = btrfs_end_buffer_write_sync; | 2622 | bh->b_end_io = btrfs_end_buffer_write_sync; |
2316 | } | 2623 | } |
2317 | 2624 | ||
2318 | if (i == last_barrier && do_barriers) | 2625 | /* |
2319 | ret = submit_bh(WRITE_FLUSH_FUA, bh); | 2626 | * we fua the first super. The others we allow |
2320 | else | 2627 | * to go down lazy. |
2321 | ret = submit_bh(WRITE_SYNC, bh); | 2628 | */ |
2322 | 2629 | ret = submit_bh(WRITE_FUA, bh); | |
2323 | if (ret) | 2630 | if (ret) |
2324 | errors++; | 2631 | errors++; |
2325 | } | 2632 | } |
2326 | return errors < i ? 0 : -1; | 2633 | return errors < i ? 0 : -1; |
2327 | } | 2634 | } |
2328 | 2635 | ||
2636 | /* | ||
2637 | * endio for the write_dev_flush, this will wake anyone waiting | ||
2638 | * for the barrier when it is done | ||
2639 | */ | ||
2640 | static void btrfs_end_empty_barrier(struct bio *bio, int err) | ||
2641 | { | ||
2642 | if (err) { | ||
2643 | if (err == -EOPNOTSUPP) | ||
2644 | set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); | ||
2645 | clear_bit(BIO_UPTODATE, &bio->bi_flags); | ||
2646 | } | ||
2647 | if (bio->bi_private) | ||
2648 | complete(bio->bi_private); | ||
2649 | bio_put(bio); | ||
2650 | } | ||
2651 | |||
2652 | /* | ||
2653 | * trigger flushes for one of the devices. If you pass wait == 0, the flushes are | ||
2654 | * sent down. With wait == 1, it waits for the previous flush. | ||
2655 | * | ||
2656 | * any device where the flush fails with EOPNOTSUPP is flagged as not | ||
2657 | * barrier-capable | ||
2658 | */ | ||
2659 | static int write_dev_flush(struct btrfs_device *device, int wait) | ||
2660 | { | ||
2661 | struct bio *bio; | ||
2662 | int ret = 0; | ||
2663 | |||
2664 | if (device->nobarriers) | ||
2665 | return 0; | ||
2666 | |||
2667 | if (wait) { | ||
2668 | bio = device->flush_bio; | ||
2669 | if (!bio) | ||
2670 | return 0; | ||
2671 | |||
2672 | wait_for_completion(&device->flush_wait); | ||
2673 | |||
2674 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) { | ||
2675 | printk(KERN_WARNING "btrfs: disabling barriers on dev %s\n", | ||
2676 | device->name); | ||
2677 | device->nobarriers = 1; | ||
2678 | } | ||
2679 | if (!bio_flagged(bio, BIO_UPTODATE)) { | ||
2680 | ret = -EIO; | ||
2681 | } | ||
2682 | |||
2683 | /* drop the reference from the wait == 0 run */ | ||
2684 | bio_put(bio); | ||
2685 | device->flush_bio = NULL; | ||
2686 | |||
2687 | return ret; | ||
2688 | } | ||
2689 | |||
2690 | /* | ||
2691 | * one reference for us, and we leave it for the | ||
2692 | * caller | ||
2693 | */ | ||
2694 | device->flush_bio = NULL; | ||
2695 | bio = bio_alloc(GFP_NOFS, 0); | ||
2696 | if (!bio) | ||
2697 | return -ENOMEM; | ||
2698 | |||
2699 | bio->bi_end_io = btrfs_end_empty_barrier; | ||
2700 | bio->bi_bdev = device->bdev; | ||
2701 | init_completion(&device->flush_wait); | ||
2702 | bio->bi_private = &device->flush_wait; | ||
2703 | device->flush_bio = bio; | ||
2704 | |||
2705 | bio_get(bio); | ||
2706 | submit_bio(WRITE_FLUSH, bio); | ||
2707 | |||
2708 | return 0; | ||
2709 | } | ||
2710 | |||
2711 | /* | ||
2712 | * send an empty flush down to each device in parallel, | ||
2713 | * then wait for them | ||
2714 | */ | ||
2715 | static int barrier_all_devices(struct btrfs_fs_info *info) | ||
2716 | { | ||
2717 | struct list_head *head; | ||
2718 | struct btrfs_device *dev; | ||
2719 | int errors = 0; | ||
2720 | int ret; | ||
2721 | |||
2722 | /* send down all the barriers */ | ||
2723 | head = &info->fs_devices->devices; | ||
2724 | list_for_each_entry_rcu(dev, head, dev_list) { | ||
2725 | if (!dev->bdev) { | ||
2726 | errors++; | ||
2727 | continue; | ||
2728 | } | ||
2729 | if (!dev->in_fs_metadata || !dev->writeable) | ||
2730 | continue; | ||
2731 | |||
2732 | ret = write_dev_flush(dev, 0); | ||
2733 | if (ret) | ||
2734 | errors++; | ||
2735 | } | ||
2736 | |||
2737 | /* wait for all the barriers */ | ||
2738 | list_for_each_entry_rcu(dev, head, dev_list) { | ||
2739 | if (!dev->bdev) { | ||
2740 | errors++; | ||
2741 | continue; | ||
2742 | } | ||
2743 | if (!dev->in_fs_metadata || !dev->writeable) | ||
2744 | continue; | ||
2745 | |||
2746 | ret = write_dev_flush(dev, 1); | ||
2747 | if (ret) | ||
2748 | errors++; | ||
2749 | } | ||
2750 | if (errors) | ||
2751 | return -EIO; | ||
2752 | return 0; | ||
2753 | } | ||
2754 | |||
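The shape of barrier_all_devices() is the point of the patch: one pass submits an empty flush bio per device without waiting, a second pass waits on each completion, so the cache flushes of all devices overlap instead of running back to back. A userspace model of the same submit-all-then-wait-all pattern, with POSIX threads standing in for bios and completions (device count and flush body are invented):

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    #define NDEV 3

    struct dev {
        pthread_t flusher;
        int error;
    };

    static void *do_flush(void *arg)
    {
        struct dev *d = arg;

        usleep(1000);       /* stand-in for the disk draining its cache */
        d->error = 0;
        return NULL;
    }

    int main(void)
    {
        struct dev devs[NDEV];
        int i, errors = 0;

        memset(devs, 0, sizeof(devs));

        /* pass 1: send every flush down without waiting (wait == 0) */
        for (i = 0; i < NDEV; i++)
            pthread_create(&devs[i].flusher, NULL, do_flush, &devs[i]);

        /* pass 2: wait for each one and collect errors (wait == 1) */
        for (i = 0; i < NDEV; i++) {
            pthread_join(devs[i].flusher, NULL);
            if (devs[i].error)
                errors++;
        }
        printf("%d flush errors\n", errors);
        return errors ? 1 : 0;
    }

As in the kernel loop, failures only bump a counter during the walks; the caller folds any error into a single -EIO at the end.
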
2329 | int write_all_supers(struct btrfs_root *root, int max_mirrors) | 2755 | int write_all_supers(struct btrfs_root *root, int max_mirrors) |
2330 | { | 2756 | { |
2331 | struct list_head *head; | 2757 | struct list_head *head; |
@@ -2338,14 +2764,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) | |||
2338 | int total_errors = 0; | 2764 | int total_errors = 0; |
2339 | u64 flags; | 2765 | u64 flags; |
2340 | 2766 | ||
2341 | max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; | 2767 | max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1; |
2342 | do_barriers = !btrfs_test_opt(root, NOBARRIER); | 2768 | do_barriers = !btrfs_test_opt(root, NOBARRIER); |
2769 | backup_super_roots(root->fs_info); | ||
2343 | 2770 | ||
2344 | sb = &root->fs_info->super_for_commit; | 2771 | sb = root->fs_info->super_for_commit; |
2345 | dev_item = &sb->dev_item; | 2772 | dev_item = &sb->dev_item; |
2346 | 2773 | ||
2347 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); | 2774 | mutex_lock(&root->fs_info->fs_devices->device_list_mutex); |
2348 | head = &root->fs_info->fs_devices->devices; | 2775 | head = &root->fs_info->fs_devices->devices; |
2776 | |||
2777 | if (do_barriers) | ||
2778 | barrier_all_devices(root->fs_info); | ||
2779 | |||
2349 | list_for_each_entry_rcu(dev, head, dev_list) { | 2780 | list_for_each_entry_rcu(dev, head, dev_list) { |
2350 | if (!dev->bdev) { | 2781 | if (!dev->bdev) { |
2351 | total_errors++; | 2782 | total_errors++; |
@@ -2545,8 +2976,6 @@ int close_ctree(struct btrfs_root *root) | |||
2545 | /* clear out the rbtree of defraggable inodes */ | 2976 | /* clear out the rbtree of defraggable inodes */ |
2546 | btrfs_run_defrag_inodes(root->fs_info); | 2977 | btrfs_run_defrag_inodes(root->fs_info); |
2547 | 2978 | ||
2548 | btrfs_put_block_group_cache(fs_info); | ||
2549 | |||
2550 | /* | 2979 | /* |
2551 | * Two situations can leave btrfs broken and flipped readonly: | 2980 | * Two situations can leave btrfs broken and flipped readonly: |
2552 | * | 2981 | * |
@@ -2572,6 +3001,8 @@ int close_ctree(struct btrfs_root *root) | |||
2572 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); | 3001 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); |
2573 | } | 3002 | } |
2574 | 3003 | ||
3004 | btrfs_put_block_group_cache(fs_info); | ||
3005 | |||
2575 | kthread_stop(root->fs_info->transaction_kthread); | 3006 | kthread_stop(root->fs_info->transaction_kthread); |
2576 | kthread_stop(root->fs_info->cleaner_kthread); | 3007 | kthread_stop(root->fs_info->cleaner_kthread); |
2577 | 3008 | ||
@@ -2603,7 +3034,6 @@ int close_ctree(struct btrfs_root *root) | |||
2603 | del_fs_roots(fs_info); | 3034 | del_fs_roots(fs_info); |
2604 | 3035 | ||
2605 | iput(fs_info->btree_inode); | 3036 | iput(fs_info->btree_inode); |
2606 | kfree(fs_info->delayed_root); | ||
2607 | 3037 | ||
2608 | btrfs_stop_workers(&fs_info->generic_worker); | 3038 | btrfs_stop_workers(&fs_info->generic_worker); |
2609 | btrfs_stop_workers(&fs_info->fixup_workers); | 3039 | btrfs_stop_workers(&fs_info->fixup_workers); |
@@ -2617,6 +3047,7 @@ int close_ctree(struct btrfs_root *root) | |||
2617 | btrfs_stop_workers(&fs_info->submit_workers); | 3047 | btrfs_stop_workers(&fs_info->submit_workers); |
2618 | btrfs_stop_workers(&fs_info->delayed_workers); | 3048 | btrfs_stop_workers(&fs_info->delayed_workers); |
2619 | btrfs_stop_workers(&fs_info->caching_workers); | 3049 | btrfs_stop_workers(&fs_info->caching_workers); |
3050 | btrfs_stop_workers(&fs_info->readahead_workers); | ||
2620 | 3051 | ||
2621 | btrfs_close_devices(fs_info->fs_devices); | 3052 | btrfs_close_devices(fs_info->fs_devices); |
2622 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 3053 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
@@ -2624,12 +3055,7 @@ int close_ctree(struct btrfs_root *root) | |||
2624 | bdi_destroy(&fs_info->bdi); | 3055 | bdi_destroy(&fs_info->bdi); |
2625 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 3056 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
2626 | 3057 | ||
2627 | kfree(fs_info->extent_root); | 3058 | free_fs_info(fs_info); |
2628 | kfree(fs_info->tree_root); | ||
2629 | kfree(fs_info->chunk_root); | ||
2630 | kfree(fs_info->dev_root); | ||
2631 | kfree(fs_info->csum_root); | ||
2632 | kfree(fs_info); | ||
2633 | 3059 | ||
2634 | return 0; | 3060 | return 0; |
2635 | } | 3061 | } |
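
Both this function and the open_ctree() failure path now funnel through free_fs_info() instead of each repeating the same run of kfree() calls. A plausible userspace model of the helper, with the field list reconstructed from the frees this diff removes, plus the two super blocks on the assumption that super_copy and super_for_commit became heap pointers in this series (as the sb = root->fs_info->super_for_commit change above suggests):

    #include <stdlib.h>

    /* userspace model: the root pointers are opaque heap objects here */
    struct btrfs_fs_info {
        void *extent_root, *tree_root, *chunk_root, *dev_root, *csum_root;
        void *super_copy, *super_for_commit;
    };

    /* one helper replaces the duplicated kfree() runs in both paths */
    static void free_fs_info(struct btrfs_fs_info *fs_info)
    {
        free(fs_info->extent_root);
        free(fs_info->tree_root);
        free(fs_info->chunk_root);
        free(fs_info->dev_root);
        free(fs_info->csum_root);
        free(fs_info->super_copy);
        free(fs_info->super_for_commit);
        free(fs_info);
    }

    int main(void)
    {
        struct btrfs_fs_info *fs_info = calloc(1, sizeof(*fs_info));

        if (fs_info)
            free_fs_info(fs_info); /* free(NULL) is a no-op, like kfree */
        return 0;
    }
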
@@ -2735,7 +3161,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) | |||
2735 | return ret; | 3161 | return ret; |
2736 | } | 3162 | } |
2737 | 3163 | ||
2738 | int btree_lock_page_hook(struct page *page) | 3164 | static int btree_lock_page_hook(struct page *page, void *data, |
3165 | void (*flush_fn)(void *)) | ||
2739 | { | 3166 | { |
2740 | struct inode *inode = page->mapping->host; | 3167 | struct inode *inode = page->mapping->host; |
2741 | struct btrfs_root *root = BTRFS_I(inode)->root; | 3168 | struct btrfs_root *root = BTRFS_I(inode)->root; |
@@ -2752,7 +3179,10 @@ int btree_lock_page_hook(struct page *page) | |||
2752 | if (!eb) | 3179 | if (!eb) |
2753 | goto out; | 3180 | goto out; |
2754 | 3181 | ||
2755 | btrfs_tree_lock(eb); | 3182 | if (!btrfs_try_tree_write_lock(eb)) { |
3183 | flush_fn(data); | ||
3184 | btrfs_tree_lock(eb); | ||
3185 | } | ||
2756 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); | 3186 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); |
2757 | 3187 | ||
2758 | if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { | 3188 | if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { |
@@ -2767,7 +3197,10 @@ int btree_lock_page_hook(struct page *page) | |||
2767 | btrfs_tree_unlock(eb); | 3197 | btrfs_tree_unlock(eb); |
2768 | free_extent_buffer(eb); | 3198 | free_extent_buffer(eb); |
2769 | out: | 3199 | out: |
2770 | lock_page(page); | 3200 | if (!trylock_page(page)) { |
3201 | flush_fn(data); | ||
3202 | lock_page(page); | ||
3203 | } | ||
2771 | return 0; | 3204 | return 0; |
2772 | } | 3205 | } |
2773 | 3206 | ||
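The same pattern appears twice in this hook: try the lock without blocking, and only fall back to the blocking acquire after flush_fn() has pushed out whatever writeback we are holding, since sleeping on the lock while we still hold queued pages can deadlock against the lock holder. A self-contained pthreads sketch of that trylock-flush-lock shape (all names invented):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t eb_lock = PTHREAD_MUTEX_INITIALIZER;

    /* stand-in for flushing the batch of pages we have queued */
    static void flush_fn(void *data)
    {
        (void)data;
        printf("flushing queued work before blocking\n");
    }

    int main(void)
    {
        if (pthread_mutex_trylock(&eb_lock) != 0) {
            flush_fn(NULL);                /* cannot get it: flush first */
            pthread_mutex_lock(&eb_lock);  /* now safe to block */
        }
        /* ... critical section ... */
        pthread_mutex_unlock(&eb_lock);
        return 0;
    }
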
@@ -3123,6 +3556,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) | |||
3123 | static struct extent_io_ops btree_extent_io_ops = { | 3556 | static struct extent_io_ops btree_extent_io_ops = { |
3124 | .write_cache_pages_lock_hook = btree_lock_page_hook, | 3557 | .write_cache_pages_lock_hook = btree_lock_page_hook, |
3125 | .readpage_end_io_hook = btree_readpage_end_io_hook, | 3558 | .readpage_end_io_hook = btree_readpage_end_io_hook, |
3559 | .readpage_io_failed_hook = btree_io_failed_hook, | ||
3126 | .submit_bio_hook = btree_submit_bio_hook, | 3560 | .submit_bio_hook = btree_submit_bio_hook, |
3127 | /* note we're sharing with inode.c for the merge bio hook */ | 3561 | /* note we're sharing with inode.c for the merge bio hook */ |
3128 | .merge_bio_hook = btrfs_merge_bio_hook, | 3562 | .merge_bio_hook = btrfs_merge_bio_hook, |
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bec3ea4bd67..c99d0a8f13f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h | |||
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, | |||
40 | u32 blocksize, u64 parent_transid); | 40 | u32 blocksize, u64 parent_transid); |
41 | int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, | 41 | int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, |
42 | u64 parent_transid); | 42 | u64 parent_transid); |
43 | int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, | ||
44 | int mirror_num, struct extent_buffer **eb); | ||
43 | struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, | 45 | struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, |
44 | u64 bytenr, u32 blocksize); | 46 | u64 bytenr, u32 blocksize); |
45 | int clean_tree_block(struct btrfs_trans_handle *trans, | 47 | int clean_tree_block(struct btrfs_trans_handle *trans, |
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, | |||
83 | struct btrfs_fs_info *fs_info); | 85 | struct btrfs_fs_info *fs_info); |
84 | int btrfs_add_log_tree(struct btrfs_trans_handle *trans, | 86 | int btrfs_add_log_tree(struct btrfs_trans_handle *trans, |
85 | struct btrfs_root *root); | 87 | struct btrfs_root *root); |
86 | int btree_lock_page_hook(struct page *page); | ||
87 | |||
88 | 88 | ||
89 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 89 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
90 | void btrfs_init_lockdep(void); | 90 | void btrfs_init_lockdep(void); |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5be06a2462..2ad813674d7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
24 | #include <linux/kthread.h> | 24 | #include <linux/kthread.h> |
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/ratelimit.h> | ||
26 | #include "compat.h" | 27 | #include "compat.h" |
27 | #include "hash.h" | 28 | #include "hash.h" |
28 | #include "ctree.h" | 29 | #include "ctree.h" |
@@ -52,6 +53,21 @@ enum { | |||
52 | CHUNK_ALLOC_LIMITED = 2, | 53 | CHUNK_ALLOC_LIMITED = 2, |
53 | }; | 54 | }; |
54 | 55 | ||
56 | /* | ||
57 | * Control how reservations are dealt with. | ||
58 | * | ||
59 | * RESERVE_FREE - freeing a reservation. | ||
60 | * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for | ||
61 | * ENOSPC accounting | ||
62 | * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update | ||
63 | * bytes_may_use as the ENOSPC accounting is done elsewhere | ||
64 | */ | ||
65 | enum { | ||
66 | RESERVE_FREE = 0, | ||
67 | RESERVE_ALLOC = 1, | ||
68 | RESERVE_ALLOC_NO_ACCOUNT = 2, | ||
69 | }; | ||
70 | |||
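One plausible reading of these modes in miniature, ahead of the btrfs_update_reserved_bytes() prototype below: RESERVE_ALLOC moves the bytes out of bytes_may_use because the ENOSPC path already counted them there, RESERVE_ALLOC_NO_ACCOUNT leaves bytes_may_use untouched, and RESERVE_FREE hands the reservation back. A toy model (not the verbatim function; the counter directions are inferred from the comment above):

    #include <stdio.h>

    enum { RESERVE_FREE = 0, RESERVE_ALLOC = 1, RESERVE_ALLOC_NO_ACCOUNT = 2 };

    struct space_info {
        unsigned long long bytes_may_use;
        unsigned long long bytes_reserved;
    };

    static void update_reserved_bytes(struct space_info *s,
                                      unsigned long long num_bytes, int reserve)
    {
        if (reserve == RESERVE_FREE) {
            s->bytes_reserved -= num_bytes;   /* reservation released */
            return;
        }
        s->bytes_reserved += num_bytes;
        if (reserve == RESERVE_ALLOC)
            s->bytes_may_use -= num_bytes;    /* already counted for ENOSPC */
    }

    int main(void)
    {
        struct space_info s = { .bytes_may_use = 1 << 20, .bytes_reserved = 0 };

        update_reserved_bytes(&s, 1 << 20, RESERVE_ALLOC);
        printf("may_use=%llu reserved=%llu\n", s.bytes_may_use, s.bytes_reserved);
        return 0;
    }
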
55 | static int update_block_group(struct btrfs_trans_handle *trans, | 71 | static int update_block_group(struct btrfs_trans_handle *trans, |
56 | struct btrfs_root *root, | 72 | struct btrfs_root *root, |
57 | u64 bytenr, u64 num_bytes, int alloc); | 73 | u64 bytenr, u64 num_bytes, int alloc); |
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level, | |||
81 | struct btrfs_key *key); | 97 | struct btrfs_key *key); |
82 | static void dump_space_info(struct btrfs_space_info *info, u64 bytes, | 98 | static void dump_space_info(struct btrfs_space_info *info, u64 bytes, |
83 | int dump_block_groups); | 99 | int dump_block_groups); |
100 | static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, | ||
101 | u64 num_bytes, int reserve); | ||
84 | 102 | ||
85 | static noinline int | 103 | static noinline int |
86 | block_group_cache_done(struct btrfs_block_group_cache *cache) | 104 | block_group_cache_done(struct btrfs_block_group_cache *cache) |
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) | |||
104 | if (atomic_dec_and_test(&cache->count)) { | 122 | if (atomic_dec_and_test(&cache->count)) { |
105 | WARN_ON(cache->pinned > 0); | 123 | WARN_ON(cache->pinned > 0); |
106 | WARN_ON(cache->reserved > 0); | 124 | WARN_ON(cache->reserved > 0); |
107 | WARN_ON(cache->reserved_pinned > 0); | ||
108 | kfree(cache->free_space_ctl); | 125 | kfree(cache->free_space_ctl); |
109 | kfree(cache); | 126 | kfree(cache); |
110 | } | 127 | } |
@@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
450 | struct btrfs_root *root, | 467 | struct btrfs_root *root, |
451 | int load_cache_only) | 468 | int load_cache_only) |
452 | { | 469 | { |
470 | DEFINE_WAIT(wait); | ||
453 | struct btrfs_fs_info *fs_info = cache->fs_info; | 471 | struct btrfs_fs_info *fs_info = cache->fs_info; |
454 | struct btrfs_caching_control *caching_ctl; | 472 | struct btrfs_caching_control *caching_ctl; |
455 | int ret = 0; | 473 | int ret = 0; |
456 | 474 | ||
457 | smp_mb(); | 475 | caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); |
458 | if (cache->cached != BTRFS_CACHE_NO) | 476 | BUG_ON(!caching_ctl); |
477 | |||
478 | INIT_LIST_HEAD(&caching_ctl->list); | ||
479 | mutex_init(&caching_ctl->mutex); | ||
480 | init_waitqueue_head(&caching_ctl->wait); | ||
481 | caching_ctl->block_group = cache; | ||
482 | caching_ctl->progress = cache->key.objectid; | ||
483 | atomic_set(&caching_ctl->count, 1); | ||
484 | caching_ctl->work.func = caching_thread; | ||
485 | |||
486 | spin_lock(&cache->lock); | ||
487 | /* | ||
488 | * This should be a rare occasion, but it could happen in the | ||
489 | * case where one thread starts to load the space cache info, and then | ||
490 | * some other thread starts a transaction commit which tries to do an | ||
491 | * allocation while the other thread is still loading the space cache | ||
492 | * info. The previous loop should have kept us from choosing this block | ||
493 | * group, but if we've moved to the state where we will wait on caching | ||
494 | * block groups we need to first check if we're doing a fast load here, | ||
495 | * so we can wait for it to finish, otherwise we could end up allocating | ||
496 | * from a block group whose cache gets evicted for one reason or | ||
497 | * another. | ||
498 | */ | ||
499 | while (cache->cached == BTRFS_CACHE_FAST) { | ||
500 | struct btrfs_caching_control *ctl; | ||
501 | |||
502 | ctl = cache->caching_ctl; | ||
503 | atomic_inc(&ctl->count); | ||
504 | prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); | ||
505 | spin_unlock(&cache->lock); | ||
506 | |||
507 | schedule(); | ||
508 | |||
509 | finish_wait(&ctl->wait, &wait); | ||
510 | put_caching_control(ctl); | ||
511 | spin_lock(&cache->lock); | ||
512 | } | ||
513 | |||
514 | if (cache->cached != BTRFS_CACHE_NO) { | ||
515 | spin_unlock(&cache->lock); | ||
516 | kfree(caching_ctl); | ||
459 | return 0; | 517 | return 0; |
518 | } | ||
519 | WARN_ON(cache->caching_ctl); | ||
520 | cache->caching_ctl = caching_ctl; | ||
521 | cache->cached = BTRFS_CACHE_FAST; | ||
522 | spin_unlock(&cache->lock); | ||
460 | 523 | ||
461 | /* | 524 | /* |
462 | * We can't do the read from on-disk cache during a commit since we need | 525 | * We can't do the read from on-disk cache during a commit since we need |
@@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
465 | * we likely hold important locks. | 528 | * we likely hold important locks. |
466 | */ | 529 | */ |
467 | if (trans && (!trans->transaction->in_commit) && | 530 | if (trans && (!trans->transaction->in_commit) && |
468 | (root && root != root->fs_info->tree_root)) { | 531 | (root && root != root->fs_info->tree_root) && |
469 | spin_lock(&cache->lock); | 532 | btrfs_test_opt(root, SPACE_CACHE)) { |
470 | if (cache->cached != BTRFS_CACHE_NO) { | ||
471 | spin_unlock(&cache->lock); | ||
472 | return 0; | ||
473 | } | ||
474 | cache->cached = BTRFS_CACHE_STARTED; | ||
475 | spin_unlock(&cache->lock); | ||
476 | |||
477 | ret = load_free_space_cache(fs_info, cache); | 533 | ret = load_free_space_cache(fs_info, cache); |
478 | 534 | ||
479 | spin_lock(&cache->lock); | 535 | spin_lock(&cache->lock); |
480 | if (ret == 1) { | 536 | if (ret == 1) { |
537 | cache->caching_ctl = NULL; | ||
481 | cache->cached = BTRFS_CACHE_FINISHED; | 538 | cache->cached = BTRFS_CACHE_FINISHED; |
482 | cache->last_byte_to_unpin = (u64)-1; | 539 | cache->last_byte_to_unpin = (u64)-1; |
483 | } else { | 540 | } else { |
484 | cache->cached = BTRFS_CACHE_NO; | 541 | if (load_cache_only) { |
542 | cache->caching_ctl = NULL; | ||
543 | cache->cached = BTRFS_CACHE_NO; | ||
544 | } else { | ||
545 | cache->cached = BTRFS_CACHE_STARTED; | ||
546 | } | ||
485 | } | 547 | } |
486 | spin_unlock(&cache->lock); | 548 | spin_unlock(&cache->lock); |
549 | wake_up(&caching_ctl->wait); | ||
487 | if (ret == 1) { | 550 | if (ret == 1) { |
551 | put_caching_control(caching_ctl); | ||
488 | free_excluded_extents(fs_info->extent_root, cache); | 552 | free_excluded_extents(fs_info->extent_root, cache); |
489 | return 0; | 553 | return 0; |
490 | } | 554 | } |
555 | } else { | ||
556 | /* | ||
557 | * We are not going to do the fast caching, set cached to the | ||
558 | * appropriate value and wake up any waiters. | ||
559 | */ | ||
560 | spin_lock(&cache->lock); | ||
561 | if (load_cache_only) { | ||
562 | cache->caching_ctl = NULL; | ||
563 | cache->cached = BTRFS_CACHE_NO; | ||
564 | } else { | ||
565 | cache->cached = BTRFS_CACHE_STARTED; | ||
566 | } | ||
567 | spin_unlock(&cache->lock); | ||
568 | wake_up(&caching_ctl->wait); | ||
491 | } | 569 | } |
492 | 570 | ||
493 | if (load_cache_only) | 571 | if (load_cache_only) { |
494 | return 0; | 572 | put_caching_control(caching_ctl); |
495 | |||
496 | caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); | ||
497 | BUG_ON(!caching_ctl); | ||
498 | |||
499 | INIT_LIST_HEAD(&caching_ctl->list); | ||
500 | mutex_init(&caching_ctl->mutex); | ||
501 | init_waitqueue_head(&caching_ctl->wait); | ||
502 | caching_ctl->block_group = cache; | ||
503 | caching_ctl->progress = cache->key.objectid; | ||
504 | /* one for caching kthread, one for caching block group list */ | ||
505 | atomic_set(&caching_ctl->count, 2); | ||
506 | caching_ctl->work.func = caching_thread; | ||
507 | |||
508 | spin_lock(&cache->lock); | ||
509 | if (cache->cached != BTRFS_CACHE_NO) { | ||
510 | spin_unlock(&cache->lock); | ||
511 | kfree(caching_ctl); | ||
512 | return 0; | 573 | return 0; |
513 | } | 574 | } |
514 | cache->caching_ctl = caching_ctl; | ||
515 | cache->cached = BTRFS_CACHE_STARTED; | ||
516 | spin_unlock(&cache->lock); | ||
517 | 575 | ||
518 | down_write(&fs_info->extent_commit_sem); | 576 | down_write(&fs_info->extent_commit_sem); |
577 | atomic_inc(&caching_ctl->count); | ||
519 | list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); | 578 | list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); |
520 | up_write(&fs_info->extent_commit_sem); | 579 | up_write(&fs_info->extent_commit_sem); |
521 | 580 | ||
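
The while loop in this hunk is the standard re-check-under-the-lock wait: take a reference on the caching control, register on its waitqueue, drop the spinlock, sleep, then retake the lock and test the state again, since another fast loader may have started in the meantime. In userspace the same shape falls out of a condition variable, as in this sketch (pthreads stand in for the kernel waitqueue; names invented):

    #include <pthread.h>

    enum { CACHE_NO, CACHE_FAST, CACHE_STARTED, CACHE_FINISHED };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cache_wait = PTHREAD_COND_INITIALIZER;
    static int cached = CACHE_NO;

    /* mirror of the while loop: sleep out any fast loader, re-checking
     * the state each time we are woken with the lock held again */
    static void wait_for_fast_load(void)
    {
        pthread_mutex_lock(&lock);
        while (cached == CACHE_FAST)
            pthread_cond_wait(&cache_wait, &lock);
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        wait_for_fast_load();  /* returns at once: nobody is fast-loading */
        return 0;
    }
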
@@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
1770 | { | 1829 | { |
1771 | int ret; | 1830 | int ret; |
1772 | u64 discarded_bytes = 0; | 1831 | u64 discarded_bytes = 0; |
1773 | struct btrfs_multi_bio *multi = NULL; | 1832 | struct btrfs_bio *bbio = NULL; |
1774 | 1833 | ||
1775 | 1834 | ||
1776 | /* Tell the block device(s) that the sectors can be discarded */ | 1835 | /* Tell the block device(s) that the sectors can be discarded */ |
1777 | ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, | 1836 | ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, |
1778 | bytenr, &num_bytes, &multi, 0); | 1837 | bytenr, &num_bytes, &bbio, 0); |
1779 | if (!ret) { | 1838 | if (!ret) { |
1780 | struct btrfs_bio_stripe *stripe = multi->stripes; | 1839 | struct btrfs_bio_stripe *stripe = bbio->stripes; |
1781 | int i; | 1840 | int i; |
1782 | 1841 | ||
1783 | 1842 | ||
1784 | for (i = 0; i < multi->num_stripes; i++, stripe++) { | 1843 | for (i = 0; i < bbio->num_stripes; i++, stripe++) { |
1785 | if (!stripe->dev->can_discard) | 1844 | if (!stripe->dev->can_discard) |
1786 | continue; | 1845 | continue; |
1787 | 1846 | ||
@@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
1800 | */ | 1859 | */ |
1801 | ret = 0; | 1860 | ret = 0; |
1802 | } | 1861 | } |
1803 | kfree(multi); | 1862 | kfree(bbio); |
1804 | } | 1863 | } |
1805 | 1864 | ||
1806 | if (actual_bytes) | 1865 | if (actual_bytes) |
@@ -2700,6 +2759,13 @@ again: | |||
2700 | goto again; | 2759 | goto again; |
2701 | } | 2760 | } |
2702 | 2761 | ||
2762 | /* We've already set up this transaction, go ahead and exit */ | ||
2763 | if (block_group->cache_generation == trans->transid && | ||
2764 | i_size_read(inode)) { | ||
2765 | dcs = BTRFS_DC_SETUP; | ||
2766 | goto out_put; | ||
2767 | } | ||
2768 | |||
2703 | /* | 2769 | /* |
2704 | * We want to set the generation to 0, that way if anything goes wrong | 2770 | * We want to set the generation to 0, that way if anything goes wrong |
2705 | * from here on out we know not to trust this cache when we load up next | 2771 | * from here on out we know not to trust this cache when we load up next |
@@ -2749,12 +2815,15 @@ again: | |||
2749 | if (!ret) | 2815 | if (!ret) |
2750 | dcs = BTRFS_DC_SETUP; | 2816 | dcs = BTRFS_DC_SETUP; |
2751 | btrfs_free_reserved_data_space(inode, num_pages); | 2817 | btrfs_free_reserved_data_space(inode, num_pages); |
2818 | |||
2752 | out_put: | 2819 | out_put: |
2753 | iput(inode); | 2820 | iput(inode); |
2754 | out_free: | 2821 | out_free: |
2755 | btrfs_release_path(path); | 2822 | btrfs_release_path(path); |
2756 | out: | 2823 | out: |
2757 | spin_lock(&block_group->lock); | 2824 | spin_lock(&block_group->lock); |
2825 | if (!ret) | ||
2826 | block_group->cache_generation = trans->transid; | ||
2758 | block_group->disk_cache_state = dcs; | 2827 | block_group->disk_cache_state = dcs; |
2759 | spin_unlock(&block_group->lock); | 2828 | spin_unlock(&block_group->lock); |
2760 | 2829 | ||
@@ -3122,16 +3191,13 @@ commit_trans: | |||
3122 | return -ENOSPC; | 3191 | return -ENOSPC; |
3123 | } | 3192 | } |
3124 | data_sinfo->bytes_may_use += bytes; | 3193 | data_sinfo->bytes_may_use += bytes; |
3125 | BTRFS_I(inode)->reserved_bytes += bytes; | ||
3126 | spin_unlock(&data_sinfo->lock); | 3194 | spin_unlock(&data_sinfo->lock); |
3127 | 3195 | ||
3128 | return 0; | 3196 | return 0; |
3129 | } | 3197 | } |
3130 | 3198 | ||
3131 | /* | 3199 | /* |
3132 | * called when we are clearing an delalloc extent from the | 3200 | * Called if we need to clear a data reservation for this inode. |
3133 | * inode's io_tree or there was an error for whatever reason | ||
3134 | * after calling btrfs_check_data_free_space | ||
3135 | */ | 3201 | */ |
3136 | void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) | 3202 | void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) |
3137 | { | 3203 | { |
@@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) | |||
3144 | data_sinfo = BTRFS_I(inode)->space_info; | 3210 | data_sinfo = BTRFS_I(inode)->space_info; |
3145 | spin_lock(&data_sinfo->lock); | 3211 | spin_lock(&data_sinfo->lock); |
3146 | data_sinfo->bytes_may_use -= bytes; | 3212 | data_sinfo->bytes_may_use -= bytes; |
3147 | BTRFS_I(inode)->reserved_bytes -= bytes; | ||
3148 | spin_unlock(&data_sinfo->lock); | 3213 | spin_unlock(&data_sinfo->lock); |
3149 | } | 3214 | } |
3150 | 3215 | ||
@@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root, | |||
3165 | struct btrfs_space_info *sinfo, u64 alloc_bytes, | 3230 | struct btrfs_space_info *sinfo, u64 alloc_bytes, |
3166 | int force) | 3231 | int force) |
3167 | { | 3232 | { |
3233 | struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; | ||
3168 | u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; | 3234 | u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; |
3169 | u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; | 3235 | u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; |
3170 | u64 thresh; | 3236 | u64 thresh; |
@@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root, | |||
3173 | return 1; | 3239 | return 1; |
3174 | 3240 | ||
3175 | /* | 3241 | /* |
3242 | * We need to take into account the global rsv because for all intents | ||
3243 | * and purposes it's used space. Don't worry about locking the | ||
3244 | * global_rsv, it doesn't change except when the transaction commits. | ||
3245 | */ | ||
3246 | num_allocated += global_rsv->size; | ||
3247 | |||
3248 | /* | ||
3176 | * in limited mode, we want to have some free space up to | 3249 | * in limited mode, we want to have some free space up to |
3177 | * about 1% of the FS size. | 3250 | * about 1% of the FS size. |
3178 | */ | 3251 | */ |
3179 | if (force == CHUNK_ALLOC_LIMITED) { | 3252 | if (force == CHUNK_ALLOC_LIMITED) { |
3180 | thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); | 3253 | thresh = btrfs_super_total_bytes(root->fs_info->super_copy); |
3181 | thresh = max_t(u64, 64 * 1024 * 1024, | 3254 | thresh = max_t(u64, 64 * 1024 * 1024, |
3182 | div_factor_fine(thresh, 1)); | 3255 | div_factor_fine(thresh, 1)); |
3183 | 3256 | ||
@@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root, | |||
3199 | if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) | 3272 | if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) |
3200 | return 0; | 3273 | return 0; |
3201 | 3274 | ||
3202 | thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); | 3275 | thresh = btrfs_super_total_bytes(root->fs_info->super_copy); |
3203 | 3276 | ||
3204 | /* 256MB or 5% of the FS */ | 3277 | /* 256MB or 5% of the FS */ |
3205 | thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); | 3278 | thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); |
@@ -3302,24 +3375,26 @@ out: | |||
3302 | /* | 3375 | /* |
3303 | * shrink metadata reservation for delalloc | 3376 | * shrink metadata reservation for delalloc |
3304 | */ | 3377 | */ |
3305 | static int shrink_delalloc(struct btrfs_trans_handle *trans, | 3378 | static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, |
3306 | struct btrfs_root *root, u64 to_reclaim, int sync) | 3379 | bool wait_ordered) |
3307 | { | 3380 | { |
3308 | struct btrfs_block_rsv *block_rsv; | 3381 | struct btrfs_block_rsv *block_rsv; |
3309 | struct btrfs_space_info *space_info; | 3382 | struct btrfs_space_info *space_info; |
3383 | struct btrfs_trans_handle *trans; | ||
3310 | u64 reserved; | 3384 | u64 reserved; |
3311 | u64 max_reclaim; | 3385 | u64 max_reclaim; |
3312 | u64 reclaimed = 0; | 3386 | u64 reclaimed = 0; |
3313 | long time_left; | 3387 | long time_left; |
3314 | int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; | 3388 | unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; |
3315 | int loops = 0; | 3389 | int loops = 0; |
3316 | unsigned long progress; | 3390 | unsigned long progress; |
3317 | 3391 | ||
3392 | trans = (struct btrfs_trans_handle *)current->journal_info; | ||
3318 | block_rsv = &root->fs_info->delalloc_block_rsv; | 3393 | block_rsv = &root->fs_info->delalloc_block_rsv; |
3319 | space_info = block_rsv->space_info; | 3394 | space_info = block_rsv->space_info; |
3320 | 3395 | ||
3321 | smp_mb(); | 3396 | smp_mb(); |
3322 | reserved = space_info->bytes_reserved; | 3397 | reserved = space_info->bytes_may_use; |
3323 | progress = space_info->reservation_progress; | 3398 | progress = space_info->reservation_progress; |
3324 | 3399 | ||
3325 | if (reserved == 0) | 3400 | if (reserved == 0) |
@@ -3334,18 +3409,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, | |||
3334 | } | 3409 | } |
3335 | 3410 | ||
3336 | max_reclaim = min(reserved, to_reclaim); | 3411 | max_reclaim = min(reserved, to_reclaim); |
3337 | 3412 | nr_pages = max_t(unsigned long, nr_pages, | |
3413 | max_reclaim >> PAGE_CACHE_SHIFT); | ||
3338 | while (loops < 1024) { | 3414 | while (loops < 1024) { |
3339 | /* have the flusher threads jump in and do some IO */ | 3415 | /* have the flusher threads jump in and do some IO */ |
3340 | smp_mb(); | 3416 | smp_mb(); |
3341 | nr_pages = min_t(unsigned long, nr_pages, | 3417 | nr_pages = min_t(unsigned long, nr_pages, |
3342 | root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); | 3418 | root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); |
3343 | writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); | 3419 | writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages, |
3420 | WB_REASON_FS_FREE_SPACE); | ||
3344 | 3421 | ||
3345 | spin_lock(&space_info->lock); | 3422 | spin_lock(&space_info->lock); |
3346 | if (reserved > space_info->bytes_reserved) | 3423 | if (reserved > space_info->bytes_may_use) |
3347 | reclaimed += reserved - space_info->bytes_reserved; | 3424 | reclaimed += reserved - space_info->bytes_may_use; |
3348 | reserved = space_info->bytes_reserved; | 3425 | reserved = space_info->bytes_may_use; |
3349 | spin_unlock(&space_info->lock); | 3426 | spin_unlock(&space_info->lock); |
3350 | 3427 | ||
3351 | loops++; | 3428 | loops++; |
@@ -3356,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, | |||
3356 | if (trans && trans->transaction->blocked) | 3433 | if (trans && trans->transaction->blocked) |
3357 | return -EAGAIN; | 3434 | return -EAGAIN; |
3358 | 3435 | ||
3359 | time_left = schedule_timeout_interruptible(1); | 3436 | if (wait_ordered && !trans) { |
3437 | btrfs_wait_ordered_extents(root, 0, 0); | ||
3438 | } else { | ||
3439 | time_left = schedule_timeout_interruptible(1); | ||
3360 | 3440 | ||
3361 | /* We were interrupted, exit */ | 3441 | /* We were interrupted, exit */ |
3362 | if (time_left) | 3442 | if (time_left) |
3363 | break; | 3443 | break; |
3444 | } | ||
3364 | 3445 | ||
3365 | /* we've kicked the IO a few times, if anything has been freed, | 3446 | /* we've kicked the IO a few times, if anything has been freed, |
3366 | * exit. There is no sense in looping here for a long time | 3447 | * exit. There is no sense in looping here for a long time |
@@ -3375,34 +3456,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, | |||
3375 | } | 3456 | } |
3376 | 3457 | ||
3377 | } | 3458 | } |
3378 | if (reclaimed >= to_reclaim && !trans) | 3459 | |
3379 | btrfs_wait_ordered_extents(root, 0, 0); | ||
3380 | return reclaimed >= to_reclaim; | 3460 | return reclaimed >= to_reclaim; |
3381 | } | 3461 | } |
3382 | 3462 | ||
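Note how shrink_delalloc() measures progress: it never asks the writeback path how much it wrote, it just re-samples bytes_may_use under the space_info lock and credits any drop as reclaimed space. A toy of that sampling loop, with all numbers invented:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long reserved = 8 << 20;   /* bytes_may_use sample */
        unsigned long long to_reclaim = 6 << 20;
        unsigned long long reclaimed = 0;
        int loops = 0;

        while (loops < 1024 && reclaimed < to_reclaim) {
            /* stand-in for kicking the flusher threads and resampling */
            unsigned long long now = reserved - (1 << 20);

            if (now < reserved)
                reclaimed += reserved - now;
            reserved = now;
            loops++;
        }
        printf("reclaimed %lluK in %d loops\n", reclaimed >> 10, loops);
        return reclaimed >= to_reclaim ? 0 : 1;
    }
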
3383 | /* | 3463 | /** |
3384 | * Retries tells us how many times we've called reserve_metadata_bytes. The | 3464 | * may_commit_transaction - possibly commit the transaction if it's ok to |
3385 | * idea is if this is the first call (retries == 0) then we will add to our | 3465 | * @root - the root we're allocating for |
3386 | * reserved count if we can't make the allocation in order to hold our place | 3466 | * @bytes - the number of bytes we want to reserve |
3387 | * while we go and try and free up space. That way for retries > 1 we don't try | 3467 | * @force - force the commit |
3388 | * and add space, we just check to see if the amount of unused space is >= the | ||
3389 | * total space, meaning that our reservation is valid. | ||
3390 | * | 3468 | * |
3391 | * However if we don't intend to retry this reservation, pass -1 as retries so | 3469 | * This will check to make sure that committing the transaction will actually |
3392 | * that it short circuits this logic. | 3470 | * get us somewhere and then commit the transaction if it does. Otherwise it |
3471 | * will return -ENOSPC. | ||
3393 | */ | 3472 | */ |
3394 | static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, | 3473 | static int may_commit_transaction(struct btrfs_root *root, |
3395 | struct btrfs_root *root, | 3474 | struct btrfs_space_info *space_info, |
3475 | u64 bytes, int force) | ||
3476 | { | ||
3477 | struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv; | ||
3478 | struct btrfs_trans_handle *trans; | ||
3479 | |||
3480 | trans = (struct btrfs_trans_handle *)current->journal_info; | ||
3481 | if (trans) | ||
3482 | return -EAGAIN; | ||
3483 | |||
3484 | if (force) | ||
3485 | goto commit; | ||
3486 | |||
3487 | /* See if there is enough pinned space to make this reservation */ | ||
3488 | spin_lock(&space_info->lock); | ||
3489 | if (space_info->bytes_pinned >= bytes) { | ||
3490 | spin_unlock(&space_info->lock); | ||
3491 | goto commit; | ||
3492 | } | ||
3493 | spin_unlock(&space_info->lock); | ||
3494 | |||
3495 | /* | ||
3496 | * See if there is some space in the delayed insertion reservation for | ||
3497 | * this reservation. | ||
3498 | */ | ||
3499 | if (space_info != delayed_rsv->space_info) | ||
3500 | return -ENOSPC; | ||
3501 | |||
3502 | spin_lock(&delayed_rsv->lock); | ||
3503 | if (delayed_rsv->size < bytes) { | ||
3504 | spin_unlock(&delayed_rsv->lock); | ||
3505 | return -ENOSPC; | ||
3506 | } | ||
3507 | spin_unlock(&delayed_rsv->lock); | ||
3508 | |||
3509 | commit: | ||
3510 | trans = btrfs_join_transaction(root); | ||
3511 | if (IS_ERR(trans)) | ||
3512 | return -ENOSPC; | ||
3513 | |||
3514 | return btrfs_commit_transaction(trans, root); | ||
3515 | } | ||
3516 | |||
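The early bail-outs are what make the helper cheap: a commit can only return pinned extents and whatever sits in the delayed-insertion reservation to the pool, so if neither could cover the request the function reports -ENOSPC without paying for a commit. The decision tree reduced to a self-contained sketch (the commit stub and the numbers in main are invented):

    #include <stdio.h>

    #define ENOSPC 28

    static int may_commit(unsigned long long bytes_pinned,
                          unsigned long long delayed_rsv_size,
                          unsigned long long bytes, int force)
    {
        if (force)
            goto commit;
        if (bytes_pinned >= bytes)
            goto commit;        /* committing unpins enough to cover us */
        if (delayed_rsv_size < bytes)
            return -ENOSPC;     /* a commit would not help, skip it */
    commit:
        return 0;               /* stand-in for btrfs_commit_transaction() */
    }

    int main(void)
    {
        /* 1M pinned cannot cover a 4M request; 8M pinned can */
        printf("%d %d\n", may_commit(1 << 20, 0, 4 << 20, 0),
               may_commit(8 << 20, 0, 4 << 20, 0));
        return 0;
    }
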
3517 | /** | ||
3518 | * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space | ||
3519 | * @root - the root we're allocating for | ||
3520 | * @block_rsv - the block_rsv we're allocating for | ||
3521 | * @orig_bytes - the number of bytes we want | ||
3522 | * @flush - whether or not we can flush to make our reservation | ||
3523 | * | ||
3524 | * This will reserve orig_bytes number of bytes from the space info associated | ||
3525 | * with the block_rsv. If there is not enough space it will make an attempt to | ||
3526 | * flush out space to make room. It will do this by flushing delalloc if | ||
3527 | * possible or committing the transaction. If flush is 0 then no attempts to | ||
3528 | * regain reservations will be made and this will fail if there is not enough | ||
3529 | * space already. | ||
3530 | */ | ||
3531 | static int reserve_metadata_bytes(struct btrfs_root *root, | ||
3396 | struct btrfs_block_rsv *block_rsv, | 3532 | struct btrfs_block_rsv *block_rsv, |
3397 | u64 orig_bytes, int flush) | 3533 | u64 orig_bytes, int flush) |
3398 | { | 3534 | { |
3399 | struct btrfs_space_info *space_info = block_rsv->space_info; | 3535 | struct btrfs_space_info *space_info = block_rsv->space_info; |
3400 | u64 unused; | 3536 | u64 used; |
3401 | u64 num_bytes = orig_bytes; | 3537 | u64 num_bytes = orig_bytes; |
3402 | int retries = 0; | 3538 | int retries = 0; |
3403 | int ret = 0; | 3539 | int ret = 0; |
3404 | bool committed = false; | 3540 | bool committed = false; |
3405 | bool flushing = false; | 3541 | bool flushing = false; |
3542 | bool wait_ordered = false; | ||
3406 | 3543 | ||
3407 | again: | 3544 | again: |
3408 | ret = 0; | 3545 | ret = 0; |
@@ -3419,7 +3556,7 @@ again: | |||
3419 | * deadlock since we are waiting for the flusher to finish, but | 3556 | * deadlock since we are waiting for the flusher to finish, but |
3420 | * hold the current transaction open. | 3557 | * hold the current transaction open. |
3421 | */ | 3558 | */ |
3422 | if (trans) | 3559 | if (current->journal_info) |
3423 | return -EAGAIN; | 3560 | return -EAGAIN; |
3424 | ret = wait_event_interruptible(space_info->wait, | 3561 | ret = wait_event_interruptible(space_info->wait, |
3425 | !space_info->flush); | 3562 | !space_info->flush); |
@@ -3431,9 +3568,9 @@ again: | |||
3431 | } | 3568 | } |
3432 | 3569 | ||
3433 | ret = -ENOSPC; | 3570 | ret = -ENOSPC; |
3434 | unused = space_info->bytes_used + space_info->bytes_reserved + | 3571 | used = space_info->bytes_used + space_info->bytes_reserved + |
3435 | space_info->bytes_pinned + space_info->bytes_readonly + | 3572 | space_info->bytes_pinned + space_info->bytes_readonly + |
3436 | space_info->bytes_may_use; | 3573 | space_info->bytes_may_use; |
3437 | 3574 | ||
3438 | /* | 3575 | /* |
3439 | * The idea here is that we've not already over-reserved the block group | 3576 | * The idea here is that we've not already over-reserved the block group |
@@ -3442,10 +3579,9 @@ again: | |||
3442 | * let's start flushing stuff first and then come back and try to make | 3579 | * let's start flushing stuff first and then come back and try to make |
3443 | * our reservation. | 3580 | * our reservation. |
3444 | */ | 3581 | */ |
3445 | if (unused <= space_info->total_bytes) { | 3582 | if (used <= space_info->total_bytes) { |
3446 | unused = space_info->total_bytes - unused; | 3583 | if (used + orig_bytes <= space_info->total_bytes) { |
3447 | if (unused >= num_bytes) { | 3584 | space_info->bytes_may_use += orig_bytes; |
3448 | space_info->bytes_reserved += orig_bytes; | ||
3449 | ret = 0; | 3585 | ret = 0; |
3450 | } else { | 3586 | } else { |
3451 | /* | 3587 | /* |
@@ -3461,10 +3597,64 @@ again: | |||
3461 | * amount plus the amount of bytes that we need for this | 3597 | * amount plus the amount of bytes that we need for this |
3462 | * reservation. | 3598 | * reservation. |
3463 | */ | 3599 | */ |
3464 | num_bytes = unused - space_info->total_bytes + | 3600 | wait_ordered = true; |
3601 | num_bytes = used - space_info->total_bytes + | ||
3465 | (orig_bytes * (retries + 1)); | 3602 | (orig_bytes * (retries + 1)); |
3466 | } | 3603 | } |
3467 | 3604 | ||
3605 | if (ret) { | ||
3606 | u64 profile = btrfs_get_alloc_profile(root, 0); | ||
3607 | u64 avail; | ||
3608 | |||
3609 | /* | ||
3610 | * If we have a lot of space that's pinned, don't bother doing | ||
3611 | * the overcommit dance yet and just commit the transaction. | ||
3612 | */ | ||
3613 | avail = (space_info->total_bytes - space_info->bytes_used) * 8; | ||
3614 | do_div(avail, 10); | ||
3615 | if (space_info->bytes_pinned >= avail && flush && !committed) { | ||
3616 | space_info->flush = 1; | ||
3617 | flushing = true; | ||
3618 | spin_unlock(&space_info->lock); | ||
3619 | ret = may_commit_transaction(root, space_info, | ||
3620 | orig_bytes, 1); | ||
3621 | if (ret) | ||
3622 | goto out; | ||
3623 | committed = true; | ||
3624 | goto again; | ||
3625 | } | ||
3626 | |||
3627 | spin_lock(&root->fs_info->free_chunk_lock); | ||
3628 | avail = root->fs_info->free_chunk_space; | ||
3629 | |||
3630 | /* | ||
3631 | * If we have dup, raid1 or raid10 then only half of the free | ||
3632 | * space is actually usable. | ||
3633 | */ | ||
3634 | if (profile & (BTRFS_BLOCK_GROUP_DUP | | ||
3635 | BTRFS_BLOCK_GROUP_RAID1 | | ||
3636 | BTRFS_BLOCK_GROUP_RAID10)) | ||
3637 | avail >>= 1; | ||
3638 | |||
3639 | /* | ||
3640 | * If we can flush, don't let us overcommit too much, say | ||
3641 | * 1/8th of the space. If we aren't flushing, let it overcommit | ||
3642 | * up to 1/2 of the space. | ||
3643 | */ | ||
3644 | if (flush) | ||
3645 | avail >>= 3; | ||
3646 | else | ||
3647 | avail >>= 1; | ||
3648 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
3649 | |||
3650 | if (used + num_bytes < space_info->total_bytes + avail) { | ||
3651 | space_info->bytes_may_use += orig_bytes; | ||
3652 | ret = 0; | ||
3653 | } else { | ||
3654 | wait_ordered = true; | ||
3655 | } | ||
3656 | } | ||
3657 | |||
3468 | /* | 3658 | /* |
3469 | * Couldn't make our reservation, save our place so while we're trying | 3659 | * Couldn't make our reservation, save our place so while we're trying |
3470 | * to reclaim space we can actually use it instead of somebody else | 3660 | * to reclaim space we can actually use it instead of somebody else |
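
A worked instance of the overcommit heuristic above, with invented numbers: 100G of free chunk space on RAID1 halves to 50G of usable space; a flushing caller may then overcommit by an eighth of that (about 6G), while a non-flushing caller would be allowed half (25G):

    #include <stdio.h>

    int main(void)
    {
        unsigned long long avail = 100ULL << 30;  /* free_chunk_space */
        int mirrored = 1;   /* dup/raid1/raid10 profile */
        int flush = 1;

        if (mirrored)
            avail >>= 1;    /* only half is actually usable */
        if (flush)
            avail >>= 3;    /* flushing callers: 1/8th */
        else
            avail >>= 1;    /* no-flush callers: 1/2 */
        printf("may overcommit up to %llu bytes (~%lluG)\n",
               avail, avail >> 30);
        return 0;
    }
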
@@ -3484,7 +3674,7 @@ again: | |||
3484 | * We do synchronous shrinking since we don't actually unreserve | 3674 | * We do synchronous shrinking since we don't actually unreserve |
3485 | * metadata until after the IO is completed. | 3675 | * metadata until after the IO is completed. |
3486 | */ | 3676 | */ |
3487 | ret = shrink_delalloc(trans, root, num_bytes, 1); | 3677 | ret = shrink_delalloc(root, num_bytes, wait_ordered); |
3488 | if (ret < 0) | 3678 | if (ret < 0) |
3489 | goto out; | 3679 | goto out; |
3490 | 3680 | ||
@@ -3496,35 +3686,17 @@ again: | |||
3496 | * so go back around and try again. | 3686 | * so go back around and try again. |
3497 | */ | 3687 | */ |
3498 | if (retries < 2) { | 3688 | if (retries < 2) { |
3689 | wait_ordered = true; | ||
3499 | retries++; | 3690 | retries++; |
3500 | goto again; | 3691 | goto again; |
3501 | } | 3692 | } |
3502 | 3693 | ||
3503 | /* | ||
3504 | * Not enough space to be reclaimed, don't bother committing the | ||
3505 | * transaction. | ||
3506 | */ | ||
3507 | spin_lock(&space_info->lock); | ||
3508 | if (space_info->bytes_pinned < orig_bytes) | ||
3509 | ret = -ENOSPC; | ||
3510 | spin_unlock(&space_info->lock); | ||
3511 | if (ret) | ||
3512 | goto out; | ||
3513 | |||
3514 | ret = -EAGAIN; | ||
3515 | if (trans) | ||
3516 | goto out; | ||
3517 | |||
3518 | ret = -ENOSPC; | 3694 | ret = -ENOSPC; |
3519 | if (committed) | 3695 | if (committed) |
3520 | goto out; | 3696 | goto out; |
3521 | 3697 | ||
3522 | trans = btrfs_join_transaction(root); | 3698 | ret = may_commit_transaction(root, space_info, orig_bytes, 0); |
3523 | if (IS_ERR(trans)) | ||
3524 | goto out; | ||
3525 | ret = btrfs_commit_transaction(trans, root); | ||
3526 | if (!ret) { | 3699 | if (!ret) { |
3527 | trans = NULL; | ||
3528 | committed = true; | 3700 | committed = true; |
3529 | goto again; | 3701 | goto again; |
3530 | } | 3702 | } |
@@ -3542,10 +3714,12 @@ out: | |||
3542 | static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, | 3714 | static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, |
3543 | struct btrfs_root *root) | 3715 | struct btrfs_root *root) |
3544 | { | 3716 | { |
3545 | struct btrfs_block_rsv *block_rsv; | 3717 | struct btrfs_block_rsv *block_rsv = NULL; |
3546 | if (root->ref_cows) | 3718 | |
3719 | if (root->ref_cows || root == root->fs_info->csum_root) | ||
3547 | block_rsv = trans->block_rsv; | 3720 | block_rsv = trans->block_rsv; |
3548 | else | 3721 | |
3722 | if (!block_rsv) | ||
3549 | block_rsv = root->block_rsv; | 3723 | block_rsv = root->block_rsv; |
3550 | 3724 | ||
3551 | if (!block_rsv) | 3725 | if (!block_rsv) |
@@ -3616,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, | |||
3616 | } | 3790 | } |
3617 | if (num_bytes) { | 3791 | if (num_bytes) { |
3618 | spin_lock(&space_info->lock); | 3792 | spin_lock(&space_info->lock); |
3619 | space_info->bytes_reserved -= num_bytes; | 3793 | space_info->bytes_may_use -= num_bytes; |
3620 | space_info->reservation_progress++; | 3794 | space_info->reservation_progress++; |
3621 | spin_unlock(&space_info->lock); | 3795 | spin_unlock(&space_info->lock); |
3622 | } | 3796 | } |
@@ -3640,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) | |||
3640 | { | 3814 | { |
3641 | memset(rsv, 0, sizeof(*rsv)); | 3815 | memset(rsv, 0, sizeof(*rsv)); |
3642 | spin_lock_init(&rsv->lock); | 3816 | spin_lock_init(&rsv->lock); |
3643 | atomic_set(&rsv->usage, 1); | ||
3644 | rsv->priority = 6; | ||
3645 | INIT_LIST_HEAD(&rsv->list); | ||
3646 | } | 3817 | } |
3647 | 3818 | ||
3648 | struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) | 3819 | struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) |
@@ -3663,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) | |||
3663 | void btrfs_free_block_rsv(struct btrfs_root *root, | 3834 | void btrfs_free_block_rsv(struct btrfs_root *root, |
3664 | struct btrfs_block_rsv *rsv) | 3835 | struct btrfs_block_rsv *rsv) |
3665 | { | 3836 | { |
3666 | if (rsv && atomic_dec_and_test(&rsv->usage)) { | 3837 | btrfs_block_rsv_release(root, rsv, (u64)-1); |
3667 | btrfs_block_rsv_release(root, rsv, (u64)-1); | 3838 | kfree(rsv); |
3668 | if (!rsv->durable) | ||
3669 | kfree(rsv); | ||
3670 | } | ||
3671 | } | ||
3672 | |||
3673 | /* | ||
3674 | * make the block_rsv struct be able to capture freed space. | ||
3675 | * the captured space will be re-added to the block_rsv struct | ||
3676 | * after transaction commit | ||
3677 | */ | ||
3678 | void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, | ||
3679 | struct btrfs_block_rsv *block_rsv) | ||
3680 | { | ||
3681 | block_rsv->durable = 1; | ||
3682 | mutex_lock(&fs_info->durable_block_rsv_mutex); | ||
3683 | list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); | ||
3684 | mutex_unlock(&fs_info->durable_block_rsv_mutex); | ||
3685 | } | 3839 | } |
3686 | 3840 | ||
3687 | int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, | 3841 | static inline int __block_rsv_add(struct btrfs_root *root, |
3688 | struct btrfs_root *root, | 3842 | struct btrfs_block_rsv *block_rsv, |
3689 | struct btrfs_block_rsv *block_rsv, | 3843 | u64 num_bytes, int flush) |
3690 | u64 num_bytes) | ||
3691 | { | 3844 | { |
3692 | int ret; | 3845 | int ret; |
3693 | 3846 | ||
3694 | if (num_bytes == 0) | 3847 | if (num_bytes == 0) |
3695 | return 0; | 3848 | return 0; |
3696 | 3849 | ||
3697 | ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); | 3850 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); |
3698 | if (!ret) { | 3851 | if (!ret) { |
3699 | block_rsv_add_bytes(block_rsv, num_bytes, 1); | 3852 | block_rsv_add_bytes(block_rsv, num_bytes, 1); |
3700 | return 0; | 3853 | return 0; |
@@ -3703,55 +3856,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, | |||
3703 | return ret; | 3856 | return ret; |
3704 | } | 3857 | } |
3705 | 3858 | ||
3706 | int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, | 3859 | int btrfs_block_rsv_add(struct btrfs_root *root, |
3707 | struct btrfs_root *root, | 3860 | struct btrfs_block_rsv *block_rsv, |
3708 | struct btrfs_block_rsv *block_rsv, | 3861 | u64 num_bytes) |
3709 | u64 min_reserved, int min_factor) | 3862 | { |
3863 | return __block_rsv_add(root, block_rsv, num_bytes, 1); | ||
3864 | } | ||
3865 | |||
3866 | int btrfs_block_rsv_add_noflush(struct btrfs_root *root, | ||
3867 | struct btrfs_block_rsv *block_rsv, | ||
3868 | u64 num_bytes) | ||
3869 | { | ||
3870 | return __block_rsv_add(root, block_rsv, num_bytes, 0); | ||
3871 | } | ||
3872 | |||
3873 | int btrfs_block_rsv_check(struct btrfs_root *root, | ||
3874 | struct btrfs_block_rsv *block_rsv, int min_factor) | ||
3710 | { | 3875 | { |
3711 | u64 num_bytes = 0; | 3876 | u64 num_bytes = 0; |
3712 | int commit_trans = 0; | ||
3713 | int ret = -ENOSPC; | 3877 | int ret = -ENOSPC; |
3714 | 3878 | ||
3715 | if (!block_rsv) | 3879 | if (!block_rsv) |
3716 | return 0; | 3880 | return 0; |
3717 | 3881 | ||
3718 | spin_lock(&block_rsv->lock); | 3882 | spin_lock(&block_rsv->lock); |
3719 | if (min_factor > 0) | 3883 | num_bytes = div_factor(block_rsv->size, min_factor); |
3720 | num_bytes = div_factor(block_rsv->size, min_factor); | 3884 | if (block_rsv->reserved >= num_bytes) |
3721 | if (min_reserved > num_bytes) | 3885 | ret = 0; |
3722 | num_bytes = min_reserved; | 3886 | spin_unlock(&block_rsv->lock); |
3723 | 3887 | ||
3724 | if (block_rsv->reserved >= num_bytes) { | 3888 | return ret; |
3889 | } | ||
3890 | |||
3891 | static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, | ||
3892 | struct btrfs_block_rsv *block_rsv, | ||
3893 | u64 min_reserved, int flush) | ||
3894 | { | ||
3895 | u64 num_bytes = 0; | ||
3896 | int ret = -ENOSPC; | ||
3897 | |||
3898 | if (!block_rsv) | ||
3899 | return 0; | ||
3900 | |||
3901 | spin_lock(&block_rsv->lock); | ||
3902 | num_bytes = min_reserved; | ||
3903 | if (block_rsv->reserved >= num_bytes) | ||
3725 | ret = 0; | 3904 | ret = 0; |
3726 | } else { | 3905 | else |
3727 | num_bytes -= block_rsv->reserved; | 3906 | num_bytes -= block_rsv->reserved; |
3728 | if (block_rsv->durable && | ||
3729 | block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) | ||
3730 | commit_trans = 1; | ||
3731 | } | ||
3732 | spin_unlock(&block_rsv->lock); | 3907 | spin_unlock(&block_rsv->lock); |
3908 | |||
3733 | if (!ret) | 3909 | if (!ret) |
3734 | return 0; | 3910 | return 0; |
3735 | 3911 | ||
3736 | if (block_rsv->refill_used) { | 3912 | ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); |
3737 | ret = reserve_metadata_bytes(trans, root, block_rsv, | 3913 | if (!ret) { |
3738 | num_bytes, 0); | 3914 | block_rsv_add_bytes(block_rsv, num_bytes, 0); |
3739 | if (!ret) { | ||
3740 | block_rsv_add_bytes(block_rsv, num_bytes, 0); | ||
3741 | return 0; | ||
3742 | } | ||
3743 | } | ||
3744 | |||
3745 | if (commit_trans) { | ||
3746 | if (trans) | ||
3747 | return -EAGAIN; | ||
3748 | trans = btrfs_join_transaction(root); | ||
3749 | BUG_ON(IS_ERR(trans)); | ||
3750 | ret = btrfs_commit_transaction(trans, root); | ||
3751 | return 0; | 3915 | return 0; |
3752 | } | 3916 | } |
3753 | 3917 | ||
3754 | return -ENOSPC; | 3918 | return ret; |
3919 | } | ||
3920 | |||
3921 | int btrfs_block_rsv_refill(struct btrfs_root *root, | ||
3922 | struct btrfs_block_rsv *block_rsv, | ||
3923 | u64 min_reserved) | ||
3924 | { | ||
3925 | return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); | ||
3926 | } | ||
3927 | |||
3928 | int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, | ||
3929 | struct btrfs_block_rsv *block_rsv, | ||
3930 | u64 min_reserved) | ||
3931 | { | ||
3932 | return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); | ||
3755 | } | 3933 | } |
3756 | 3934 | ||
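btrfs_block_rsv_add()/btrfs_block_rsv_refill() and their _noflush twins differ only in the flush flag threaded through to reserve_metadata_bytes(): callers that already sit inside a transaction must not recurse into flushing or committing, so they take the fail-fast variant and accept -ENOSPC. The wrapper split in a self-contained sketch (the reserve stub and its size threshold are invented):

    #include <stdio.h>

    #define ENOSPC 28

    /* stand-in for reserve_metadata_bytes(): pretend flushing always finds
     * room and the no-flush path only succeeds for small requests */
    static int reserve_bytes(unsigned long long num_bytes, int flush)
    {
        if (flush || num_bytes <= 1 << 20)
            return 0;
        return -ENOSPC;
    }

    static int block_rsv_add(unsigned long long num_bytes)
    {
        return reserve_bytes(num_bytes, 1);  /* may flush to make room */
    }

    static int block_rsv_add_noflush(unsigned long long num_bytes)
    {
        return reserve_bytes(num_bytes, 0);  /* fail fast, never flush */
    }

    int main(void)
    {
        printf("flush: %d, noflush: %d\n",
               block_rsv_add(4 << 20), block_rsv_add_noflush(4 << 20));
        return 0;
    }
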
3757 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, | 3935 | int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, |
@@ -3783,7 +3961,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) | |||
3783 | u64 num_bytes; | 3961 | u64 num_bytes; |
3784 | u64 meta_used; | 3962 | u64 meta_used; |
3785 | u64 data_used; | 3963 | u64 data_used; |
3786 | int csum_size = btrfs_super_csum_size(&fs_info->super_copy); | 3964 | int csum_size = btrfs_super_csum_size(fs_info->super_copy); |
3787 | 3965 | ||
3788 | sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); | 3966 | sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); |
3789 | spin_lock(&sinfo->lock); | 3967 | spin_lock(&sinfo->lock); |
@@ -3827,12 +4005,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) | |||
3827 | if (sinfo->total_bytes > num_bytes) { | 4005 | if (sinfo->total_bytes > num_bytes) { |
3828 | num_bytes = sinfo->total_bytes - num_bytes; | 4006 | num_bytes = sinfo->total_bytes - num_bytes; |
3829 | block_rsv->reserved += num_bytes; | 4007 | block_rsv->reserved += num_bytes; |
3830 | sinfo->bytes_reserved += num_bytes; | 4008 | sinfo->bytes_may_use += num_bytes; |
3831 | } | 4009 | } |
3832 | 4010 | ||
3833 | if (block_rsv->reserved >= block_rsv->size) { | 4011 | if (block_rsv->reserved >= block_rsv->size) { |
3834 | num_bytes = block_rsv->reserved - block_rsv->size; | 4012 | num_bytes = block_rsv->reserved - block_rsv->size; |
3835 | sinfo->bytes_reserved -= num_bytes; | 4013 | sinfo->bytes_may_use -= num_bytes; |
3836 | sinfo->reservation_progress++; | 4014 | sinfo->reservation_progress++; |
3837 | block_rsv->reserved = block_rsv->size; | 4015 | block_rsv->reserved = block_rsv->size; |
3838 | block_rsv->full = 1; | 4016 | block_rsv->full = 1; |
@@ -3848,16 +4026,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) | |||
3848 | 4026 | ||
3849 | space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); | 4027 | space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); |
3850 | fs_info->chunk_block_rsv.space_info = space_info; | 4028 | fs_info->chunk_block_rsv.space_info = space_info; |
3851 | fs_info->chunk_block_rsv.priority = 10; | ||
3852 | 4029 | ||
3853 | space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); | 4030 | space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); |
3854 | fs_info->global_block_rsv.space_info = space_info; | 4031 | fs_info->global_block_rsv.space_info = space_info; |
3855 | fs_info->global_block_rsv.priority = 10; | ||
3856 | fs_info->global_block_rsv.refill_used = 1; | ||
3857 | fs_info->delalloc_block_rsv.space_info = space_info; | 4032 | fs_info->delalloc_block_rsv.space_info = space_info; |
3858 | fs_info->trans_block_rsv.space_info = space_info; | 4033 | fs_info->trans_block_rsv.space_info = space_info; |
3859 | fs_info->empty_block_rsv.space_info = space_info; | 4034 | fs_info->empty_block_rsv.space_info = space_info; |
3860 | fs_info->empty_block_rsv.priority = 10; | 4035 | fs_info->delayed_block_rsv.space_info = space_info; |
3861 | 4036 | ||
3862 | fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; | 4037 | fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; |
3863 | fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; | 4038 | fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; |
@@ -3865,10 +4040,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) | |||
3865 | fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; | 4040 | fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; |
3866 | fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; | 4041 | fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; |
3867 | 4042 | ||
3868 | btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); | ||
3869 | |||
3870 | btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); | ||
3871 | |||
3872 | update_global_block_rsv(fs_info); | 4043 | update_global_block_rsv(fs_info); |
3873 | } | 4044 | } |
3874 | 4045 | ||
@@ -3881,37 +4052,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) | |||
3881 | WARN_ON(fs_info->trans_block_rsv.reserved > 0); | 4052 | WARN_ON(fs_info->trans_block_rsv.reserved > 0); |
3882 | WARN_ON(fs_info->chunk_block_rsv.size > 0); | 4053 | WARN_ON(fs_info->chunk_block_rsv.size > 0); |
3883 | WARN_ON(fs_info->chunk_block_rsv.reserved > 0); | 4054 | WARN_ON(fs_info->chunk_block_rsv.reserved > 0); |
3884 | } | 4055 | WARN_ON(fs_info->delayed_block_rsv.size > 0); |
3885 | 4056 | WARN_ON(fs_info->delayed_block_rsv.reserved > 0); | |
3886 | int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, | ||
3887 | struct btrfs_root *root, | ||
3888 | struct btrfs_block_rsv *rsv) | ||
3889 | { | ||
3890 | struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; | ||
3891 | u64 num_bytes; | ||
3892 | int ret; | ||
3893 | |||
3894 | /* | ||
3895 | * Truncate should be freeing data, but give us 2 items just in case it | ||
3896 | * needs to use some space. We may want to be smarter about this in the | ||
3897 | * future. | ||
3898 | */ | ||
3899 | num_bytes = btrfs_calc_trans_metadata_size(root, 2); | ||
3900 | |||
3901 | /* We already have enough bytes, just return */ | ||
3902 | if (rsv->reserved >= num_bytes) | ||
3903 | return 0; | ||
3904 | |||
3905 | num_bytes -= rsv->reserved; | ||
3906 | |||
3907 | /* | ||
3908 | * You should have reserved enough space beforehand to do this, so this | ||
3909 | * should not fail. | ||
3910 | */ | ||
3911 | ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); | ||
3912 | BUG_ON(ret); | ||
3913 | |||
3914 | return 0; | ||
3915 | } | 4057 | } |
3916 | 4058 | ||
3917 | void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, | 4059 | void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, |
@@ -3920,9 +4062,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, | |||
3920 | if (!trans->bytes_reserved) | 4062 | if (!trans->bytes_reserved) |
3921 | return; | 4063 | return; |
3922 | 4064 | ||
3923 | BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); | 4065 | btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); |
3924 | btrfs_block_rsv_release(root, trans->block_rsv, | ||
3925 | trans->bytes_reserved); | ||
3926 | trans->bytes_reserved = 0; | 4066 | trans->bytes_reserved = 0; |
3927 | } | 4067 | } |
3928 | 4068 | ||
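
The reserve/release pairing that btrfs_trans_release_metadata() finishes can be checked outside the kernel. Below is a minimal, self-contained C sketch of the same flow; the structs are hypothetical stand-ins for btrfs_block_rsv and the space-info free pool, and locking is omitted:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for btrfs_block_rsv and a space-info pool. */
struct toy_rsv {
        uint64_t reserved;              /* bytes currently held */
};

struct toy_trans {
        struct toy_rsv *block_rsv;
        uint64_t bytes_reserved;        /* what this handle took at start */
};

/* Mirrors btrfs_trans_release_metadata(): give back whatever the handle
 * still holds, exactly once. */
static void toy_trans_release_metadata(struct toy_trans *trans,
                                       uint64_t *pool_free)
{
        if (!trans->bytes_reserved)
                return;
        trans->block_rsv->reserved -= trans->bytes_reserved;
        *pool_free += trans->bytes_reserved;
        trans->bytes_reserved = 0;
}

int main(void)
{
        uint64_t pool_free = 1 << 20;
        struct toy_rsv rsv = { 0 };
        struct toy_trans trans = { &rsv, 0 };

        /* Transaction start: move bytes from the pool into the rsv. */
        trans.bytes_reserved = 64 * 1024;
        pool_free -= trans.bytes_reserved;
        rsv.reserved += trans.bytes_reserved;

        /* Transaction end: the release is idempotent. */
        toy_trans_release_metadata(&trans, &pool_free);
        toy_trans_release_metadata(&trans, &pool_free);
        printf("pool=%llu rsv=%llu\n", (unsigned long long)pool_free,
               (unsigned long long)rsv.reserved);
        return 0;
}
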
@@ -3964,33 +4104,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, | |||
3964 | return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); | 4104 | return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); |
3965 | } | 4105 | } |
3966 | 4106 | ||
4107 | /** | ||
4108 | * drop_outstanding_extent - drop an outstanding extent | ||
4109 | * @inode: the inode we're dropping the extent for | ||
4110 | * | ||
4111 | * This is called when we are freeing up an outstanding extent, either after | ||
4112 | * an error or after an extent is written. This will return the number of | ||
4113 | * reserved extents that need to be freed. This must be called with | ||
4114 | * BTRFS_I(inode)->lock held. | ||
4115 | */ | ||
3967 | static unsigned drop_outstanding_extent(struct inode *inode) | 4116 | static unsigned drop_outstanding_extent(struct inode *inode) |
3968 | { | 4117 | { |
4118 | unsigned drop_inode_space = 0; | ||
3969 | unsigned dropped_extents = 0; | 4119 | unsigned dropped_extents = 0; |
3970 | 4120 | ||
3971 | spin_lock(&BTRFS_I(inode)->lock); | ||
3972 | BUG_ON(!BTRFS_I(inode)->outstanding_extents); | 4121 | BUG_ON(!BTRFS_I(inode)->outstanding_extents); |
3973 | BTRFS_I(inode)->outstanding_extents--; | 4122 | BTRFS_I(inode)->outstanding_extents--; |
3974 | 4123 | ||
4124 | if (BTRFS_I(inode)->outstanding_extents == 0 && | ||
4125 | BTRFS_I(inode)->delalloc_meta_reserved) { | ||
4126 | drop_inode_space = 1; | ||
4127 | BTRFS_I(inode)->delalloc_meta_reserved = 0; | ||
4128 | } | ||
4129 | |||
3975 | /* | 4130 | /* |
3976 | * If we have more or the same amount of outstanding extents than we have | 4131 | * If we have more or the same amount of outstanding extents than we have |
3977 | * reserved then we need to leave the reserved extents count alone. | 4132 | * reserved then we need to leave the reserved extents count alone. |
3978 | */ | 4133 | */ |
3979 | if (BTRFS_I(inode)->outstanding_extents >= | 4134 | if (BTRFS_I(inode)->outstanding_extents >= |
3980 | BTRFS_I(inode)->reserved_extents) | 4135 | BTRFS_I(inode)->reserved_extents) |
3981 | goto out; | 4136 | return drop_inode_space; |
3982 | 4137 | ||
3983 | dropped_extents = BTRFS_I(inode)->reserved_extents - | 4138 | dropped_extents = BTRFS_I(inode)->reserved_extents - |
3984 | BTRFS_I(inode)->outstanding_extents; | 4139 | BTRFS_I(inode)->outstanding_extents; |
3985 | BTRFS_I(inode)->reserved_extents -= dropped_extents; | 4140 | BTRFS_I(inode)->reserved_extents -= dropped_extents; |
3986 | out: | 4141 | return dropped_extents + drop_inode_space; |
3987 | spin_unlock(&BTRFS_I(inode)->lock); | ||
3988 | return dropped_extents; | ||
3989 | } | 4142 | } |
3990 | 4143 | ||
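
The accounting in drop_outstanding_extent() is easy to verify in isolation. Here is a minimal, self-contained C sketch of the same logic (the struct is a hypothetical stand-in for the BTRFS_I(inode) fields and the caller's spinlock is omitted); it shows that the inode-update reservation is only returned together with the last outstanding extent:

#include <assert.h>
#include <stdio.h>

/* Hypothetical stand-in for the per-inode fields the kernel code uses. */
struct toy_inode {
        unsigned outstanding_extents;
        unsigned reserved_extents;
        int delalloc_meta_reserved;
};

/* Mirrors drop_outstanding_extent(): returns how many reserved items
 * (extents plus, possibly, the inode-update item) can now be freed. */
static unsigned toy_drop_outstanding_extent(struct toy_inode *inode)
{
        unsigned drop_inode_space = 0;
        unsigned dropped_extents = 0;

        assert(inode->outstanding_extents);
        inode->outstanding_extents--;

        /* The inode-update item rides along with the last extent. */
        if (inode->outstanding_extents == 0 &&
            inode->delalloc_meta_reserved) {
                drop_inode_space = 1;
                inode->delalloc_meta_reserved = 0;
        }

        if (inode->outstanding_extents >= inode->reserved_extents)
                return drop_inode_space;

        dropped_extents = inode->reserved_extents -
                          inode->outstanding_extents;
        inode->reserved_extents -= dropped_extents;
        return dropped_extents + drop_inode_space;
}

int main(void)
{
        struct toy_inode i = { 2, 2, 1 };

        printf("%u\n", toy_drop_outstanding_extent(&i));  /* 1: one extent */
        printf("%u\n", toy_drop_outstanding_extent(&i));  /* 2: extent + inode item */
        return 0;
}
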
3991 | static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) | 4144 | /** |
4145 | * calc_csum_metadata_size - return the amount of metadata space that must be | ||
4146 | * reserved/freed for the given bytes. | ||
4147 | * @inode: the inode we're manipulating | ||
4148 | * @num_bytes: the number of bytes in question | ||
4149 | * @reserve: 1 if we are reserving space, 0 if we are freeing space | ||
4150 | * | ||
4151 | * This adjusts the number of csum_bytes in the inode and then returns the | ||
4152 | * correct amount of metadata that must either be reserved or freed. We | ||
4153 | * calculate how many checksums we can fit into one leaf and then divide the | ||
4154 | * number of bytes that will need to be checksummed by this value to figure out | ||
4155 | * how many checksums will be required. If we are adding bytes then the number | ||
4156 | * may go up and we will return the number of additional bytes that must be | ||
4157 | * reserved. If it is going down we will return the number of bytes that must | ||
4158 | * be freed. | ||
4159 | * | ||
4160 | * This must be called with BTRFS_I(inode)->lock held. | ||
4161 | */ | ||
4162 | static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, | ||
4163 | int reserve) | ||
3992 | { | 4164 | { |
3993 | return num_bytes >>= 3; | 4165 | struct btrfs_root *root = BTRFS_I(inode)->root; |
4166 | u64 csum_size; | ||
4167 | int num_csums_per_leaf; | ||
4168 | int num_csums; | ||
4169 | int old_csums; | ||
4170 | |||
4171 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && | ||
4172 | BTRFS_I(inode)->csum_bytes == 0) | ||
4173 | return 0; | ||
4174 | |||
4175 | old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); | ||
4176 | if (reserve) | ||
4177 | BTRFS_I(inode)->csum_bytes += num_bytes; | ||
4178 | else | ||
4179 | BTRFS_I(inode)->csum_bytes -= num_bytes; | ||
4180 | csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); | ||
4181 | num_csums_per_leaf = (int)div64_u64(csum_size, | ||
4182 | sizeof(struct btrfs_csum_item) + | ||
4183 | sizeof(struct btrfs_disk_key)); | ||
4184 | num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); | ||
4185 | num_csums = num_csums + num_csums_per_leaf - 1; | ||
4186 | num_csums = num_csums / num_csums_per_leaf; | ||
4187 | |||
4188 | old_csums = old_csums + num_csums_per_leaf - 1; | ||
4189 | old_csums = old_csums / num_csums_per_leaf; | ||
4190 | |||
4191 | /* No change, no need to reserve more */ | ||
4192 | if (old_csums == num_csums) | ||
4193 | return 0; | ||
4194 | |||
4195 | if (reserve) | ||
4196 | return btrfs_calc_trans_metadata_size(root, | ||
4197 | num_csums - old_csums); | ||
4198 | |||
4199 | return btrfs_calc_trans_metadata_size(root, old_csums - num_csums); | ||
3994 | } | 4200 | } |
3995 | 4201 | ||
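
The rounding in calc_csum_metadata_size() is the whole point: the reservation only changes when the number of csum leaves changes. A worked, self-contained example with illustrative constants (the real values come from BTRFS_LEAF_DATA_SIZE() and the csum item/key sizes, so the numbers below are assumptions):

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE      4096ULL /* illustrative */
#define CSUM_LEAF_SPACE 3995ULL /* hypothetical usable bytes per leaf */
#define CSUM_SLOT_SIZE  18ULL   /* hypothetical bytes tracked per csum */

/* How many leaves of csum items do csum_bytes of data need? */
static uint64_t csum_leaves(uint64_t csum_bytes)
{
        uint64_t num_csums = csum_bytes / SECTORSIZE;
        uint64_t per_leaf = CSUM_LEAF_SPACE / CSUM_SLOT_SIZE;   /* 221 */

        return (num_csums + per_leaf - 1) / per_leaf;           /* round up */
}

int main(void)
{
        uint64_t old_leaves = csum_leaves(1 << 20);     /* 256 csums -> 2 */
        uint64_t new_leaves = csum_leaves(2 << 20);     /* 512 csums -> 3 */

        /* Only the delta is reserved; if old == new, nothing happens. */
        printf("delta=%llu leaf(s)\n",
               (unsigned long long)(new_leaves - old_leaves));
        return 0;
}
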
3996 | int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | 4202 | int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) |
@@ -3999,9 +4205,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
3999 | struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; | 4205 | struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; |
4000 | u64 to_reserve = 0; | 4206 | u64 to_reserve = 0; |
4001 | unsigned nr_extents = 0; | 4207 | unsigned nr_extents = 0; |
4208 | int flush = 1; | ||
4002 | int ret; | 4209 | int ret; |
4003 | 4210 | ||
4004 | if (btrfs_transaction_in_commit(root->fs_info)) | 4211 | if (btrfs_is_free_space_inode(root, inode)) |
4212 | flush = 0; | ||
4213 | |||
4214 | if (flush && btrfs_transaction_in_commit(root->fs_info)) | ||
4005 | schedule_timeout(1); | 4215 | schedule_timeout(1); |
4006 | 4216 | ||
4007 | num_bytes = ALIGN(num_bytes, root->sectorsize); | 4217 | num_bytes = ALIGN(num_bytes, root->sectorsize); |
@@ -4014,21 +4224,41 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4014 | nr_extents = BTRFS_I(inode)->outstanding_extents - | 4224 | nr_extents = BTRFS_I(inode)->outstanding_extents - |
4015 | BTRFS_I(inode)->reserved_extents; | 4225 | BTRFS_I(inode)->reserved_extents; |
4016 | BTRFS_I(inode)->reserved_extents += nr_extents; | 4226 | BTRFS_I(inode)->reserved_extents += nr_extents; |
4227 | } | ||
4017 | 4228 | ||
4018 | to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); | 4229 | /* |
4230 | * Add an item to reserve for updating the inode when we complete the | ||
4231 | * delalloc io. | ||
4232 | */ | ||
4233 | if (!BTRFS_I(inode)->delalloc_meta_reserved) { | ||
4234 | nr_extents++; | ||
4235 | BTRFS_I(inode)->delalloc_meta_reserved = 1; | ||
4019 | } | 4236 | } |
4237 | |||
4238 | to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); | ||
4239 | to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); | ||
4020 | spin_unlock(&BTRFS_I(inode)->lock); | 4240 | spin_unlock(&BTRFS_I(inode)->lock); |
4021 | 4241 | ||
4022 | to_reserve += calc_csum_metadata_size(inode, num_bytes); | 4242 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); |
4023 | ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); | ||
4024 | if (ret) { | 4243 | if (ret) { |
4244 | u64 to_free = 0; | ||
4025 | unsigned dropped; | 4245 | unsigned dropped; |
4246 | |||
4247 | spin_lock(&BTRFS_I(inode)->lock); | ||
4248 | dropped = drop_outstanding_extent(inode); | ||
4249 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
4250 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4251 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
4252 | |||
4026 | /* | 4253 | /* |
4027 | * We don't need the return value since our reservation failed, | 4254 | * Somebody could have come in and twiddled with the |
4028 | * we just need to clean up our counter. | 4255 | * reservation, so if we have to free more than we would have |
4256 | * reserved from this reservation, go ahead and release those | ||
4257 | * bytes. | ||
4029 | */ | 4258 | */ |
4030 | dropped = drop_outstanding_extent(inode); | 4259 | to_free -= to_reserve; |
4031 | WARN_ON(dropped > 1); | 4260 | if (to_free) |
4261 | btrfs_block_rsv_release(root, block_rsv, to_free); | ||
4032 | return ret; | 4262 | return ret; |
4033 | } | 4263 | } |
4034 | 4264 | ||
@@ -4037,6 +4267,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
4037 | return 0; | 4267 | return 0; |
4038 | } | 4268 | } |
4039 | 4269 | ||
4270 | /** | ||
4271 | * btrfs_delalloc_release_metadata - release a metadata reservation for an inode | ||
4272 | * @inode: the inode to release the reservation for | ||
4273 | * @num_bytes: the number of bytes we're releasing | ||
4274 | * | ||
4275 | * This will release the metadata reservation for an inode. This can be called | ||
4276 | * once we complete IO for a given set of bytes to release their metadata | ||
4277 | * reservations. | ||
4278 | */ | ||
4040 | void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) | 4279 | void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) |
4041 | { | 4280 | { |
4042 | struct btrfs_root *root = BTRFS_I(inode)->root; | 4281 | struct btrfs_root *root = BTRFS_I(inode)->root; |
@@ -4044,9 +4283,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) | |||
4044 | unsigned dropped; | 4283 | unsigned dropped; |
4045 | 4284 | ||
4046 | num_bytes = ALIGN(num_bytes, root->sectorsize); | 4285 | num_bytes = ALIGN(num_bytes, root->sectorsize); |
4286 | spin_lock(&BTRFS_I(inode)->lock); | ||
4047 | dropped = drop_outstanding_extent(inode); | 4287 | dropped = drop_outstanding_extent(inode); |
4048 | 4288 | ||
4049 | to_free = calc_csum_metadata_size(inode, num_bytes); | 4289 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); |
4290 | spin_unlock(&BTRFS_I(inode)->lock); | ||
4050 | if (dropped > 0) | 4291 | if (dropped > 0) |
4051 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | 4292 | to_free += btrfs_calc_trans_metadata_size(root, dropped); |
4052 | 4293 | ||
@@ -4054,6 +4295,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) | |||
4054 | to_free); | 4295 | to_free); |
4055 | } | 4296 | } |
4056 | 4297 | ||
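
Taken together, btrfs_delalloc_reserve_metadata() and btrfs_delalloc_release_metadata() form a strict pairing. A hedged sketch of a caller (kernel context; the function name and the trimmed middle are illustrative, and real callers usually release via the ordered-extent or error paths rather than inline like this):

static int write_range_sketch(struct inode *inode, u64 num_bytes)
{
        int ret;

        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
        if (ret)
                return ret;     /* -ENOSPC: nothing to undo */

        /* ... mark the range delalloc, let writeback do the IO ... */

        /* Every successful reserve must eventually be matched. */
        btrfs_delalloc_release_metadata(inode, num_bytes);
        return 0;
}
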
4298 | /** | ||
4299 | * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc | ||
4300 | * @inode: inode we're writing to | ||
4301 | * @num_bytes: the number of bytes we want to allocate | ||
4302 | * | ||
4303 | * This will do the following things | ||
4304 | * | ||
4305 | * o reserve space in the data space info for num_bytes | ||
4306 | * o reserve space in the metadata space info based on number of outstanding | ||
4307 | * extents and how much csums will be needed | ||
4308 | * o add to the inode's ->delalloc_bytes | ||
4309 | * o add it to the fs_info's delalloc inodes list. | ||
4310 | * | ||
4311 | * This will return 0 for success and -ENOSPC if there is no space left. | ||
4312 | */ | ||
4057 | int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) | 4313 | int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) |
4058 | { | 4314 | { |
4059 | int ret; | 4315 | int ret; |
@@ -4071,6 +4327,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) | |||
4071 | return 0; | 4327 | return 0; |
4072 | } | 4328 | } |
4073 | 4329 | ||
4330 | /** | ||
4331 | * btrfs_delalloc_release_space - release data and metadata space for delalloc | ||
4332 | * @inode: inode we're releasing space for | ||
4333 | * @num_bytes: the number of bytes we want to free up | ||
4334 | * | ||
4335 | * This must be matched with a call to btrfs_delalloc_reserve_space. This is | ||
4336 | * called in the case that we don't need the metadata AND data reservations | ||
4337 | * anymore. So if there is an error or we insert an inline extent. | ||
4338 | * | ||
4339 | * This function will release the metadata space that was not used and will | ||
4340 | * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes | ||
4341 | * list if there are no delalloc bytes left. | ||
4342 | */ | ||
4074 | void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) | 4343 | void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) |
4075 | { | 4344 | { |
4076 | btrfs_delalloc_release_metadata(inode, num_bytes); | 4345 | btrfs_delalloc_release_metadata(inode, num_bytes); |
@@ -4090,12 +4359,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
4090 | 4359 | ||
4091 | /* block accounting for super block */ | 4360 | /* block accounting for super block */ |
4092 | spin_lock(&info->delalloc_lock); | 4361 | spin_lock(&info->delalloc_lock); |
4093 | old_val = btrfs_super_bytes_used(&info->super_copy); | 4362 | old_val = btrfs_super_bytes_used(info->super_copy); |
4094 | if (alloc) | 4363 | if (alloc) |
4095 | old_val += num_bytes; | 4364 | old_val += num_bytes; |
4096 | else | 4365 | else |
4097 | old_val -= num_bytes; | 4366 | old_val -= num_bytes; |
4098 | btrfs_set_super_bytes_used(&info->super_copy, old_val); | 4367 | btrfs_set_super_bytes_used(info->super_copy, old_val); |
4099 | spin_unlock(&info->delalloc_lock); | 4368 | spin_unlock(&info->delalloc_lock); |
4100 | 4369 | ||
4101 | while (total) { | 4370 | while (total) { |
@@ -4123,7 +4392,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
4123 | spin_lock(&cache->space_info->lock); | 4392 | spin_lock(&cache->space_info->lock); |
4124 | spin_lock(&cache->lock); | 4393 | spin_lock(&cache->lock); |
4125 | 4394 | ||
4126 | if (btrfs_super_cache_generation(&info->super_copy) != 0 && | 4395 | if (btrfs_test_opt(root, SPACE_CACHE) && |
4127 | cache->disk_cache_state < BTRFS_DC_CLEAR) | 4396 | cache->disk_cache_state < BTRFS_DC_CLEAR) |
4128 | cache->disk_cache_state = BTRFS_DC_CLEAR; | 4397 | cache->disk_cache_state = BTRFS_DC_CLEAR; |
4129 | 4398 | ||
@@ -4135,7 +4404,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
4135 | btrfs_set_block_group_used(&cache->item, old_val); | 4404 | btrfs_set_block_group_used(&cache->item, old_val); |
4136 | cache->reserved -= num_bytes; | 4405 | cache->reserved -= num_bytes; |
4137 | cache->space_info->bytes_reserved -= num_bytes; | 4406 | cache->space_info->bytes_reserved -= num_bytes; |
4138 | cache->space_info->reservation_progress++; | ||
4139 | cache->space_info->bytes_used += num_bytes; | 4407 | cache->space_info->bytes_used += num_bytes; |
4140 | cache->space_info->disk_used += num_bytes * factor; | 4408 | cache->space_info->disk_used += num_bytes * factor; |
4141 | spin_unlock(&cache->lock); | 4409 | spin_unlock(&cache->lock); |
@@ -4187,7 +4455,6 @@ static int pin_down_extent(struct btrfs_root *root, | |||
4187 | if (reserved) { | 4455 | if (reserved) { |
4188 | cache->reserved -= num_bytes; | 4456 | cache->reserved -= num_bytes; |
4189 | cache->space_info->bytes_reserved -= num_bytes; | 4457 | cache->space_info->bytes_reserved -= num_bytes; |
4190 | cache->space_info->reservation_progress++; | ||
4191 | } | 4458 | } |
4192 | spin_unlock(&cache->lock); | 4459 | spin_unlock(&cache->lock); |
4193 | spin_unlock(&cache->space_info->lock); | 4460 | spin_unlock(&cache->space_info->lock); |
@@ -4215,45 +4482,82 @@ int btrfs_pin_extent(struct btrfs_root *root, | |||
4215 | } | 4482 | } |
4216 | 4483 | ||
4217 | /* | 4484 | /* |
4218 | * update size of reserved extents. this function may return -EAGAIN | 4485 | * this function must be called within a transaction |
4219 | * if 'reserve' is true or 'sinfo' is false. | 4486 | */ |
4487 | int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, | ||
4488 | struct btrfs_root *root, | ||
4489 | u64 bytenr, u64 num_bytes) | ||
4490 | { | ||
4491 | struct btrfs_block_group_cache *cache; | ||
4492 | |||
4493 | cache = btrfs_lookup_block_group(root->fs_info, bytenr); | ||
4494 | BUG_ON(!cache); | ||
4495 | |||
4496 | /* | ||
4497 | * pull in the free space cache (if any) so that our pin | ||
4498 | * removes the free space from the cache. We have load_only set | ||
4499 | * to one because the slow code to read in the free extents does check | ||
4500 | * the pinned extents. | ||
4501 | */ | ||
4502 | cache_block_group(cache, trans, root, 1); | ||
4503 | |||
4504 | pin_down_extent(root, cache, bytenr, num_bytes, 0); | ||
4505 | |||
4506 | /* remove us from the free space cache (if we're there at all) */ | ||
4507 | btrfs_remove_free_space(cache, bytenr, num_bytes); | ||
4508 | btrfs_put_block_group(cache); | ||
4509 | return 0; | ||
4510 | } | ||
4511 | |||
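
A hedged usage sketch for the new helper (kernel context; the walker function is hypothetical): during tree-log replay each referenced extent is pinned so the allocator cannot hand it out again before the replayed transaction commits.

static int replay_pin_one_extent(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 u64 bytenr, u64 num_bytes)
{
        /* Must be called within the replay transaction, per the comment
         * above; it also drops the range from the free space cache. */
        return btrfs_pin_extent_for_log_replay(trans, root,
                                               bytenr, num_bytes);
}
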
4512 | /** | ||
4513 | * btrfs_update_reserved_bytes - update the block_group and space info counters | ||
4514 | * @cache: The cache we are manipulating | ||
4515 | * @num_bytes: The number of bytes in question | ||
4516 | * @reserve: One of the reservation enums | ||
4517 | * | ||
4518 | * This is called by the allocator when it reserves space, or by somebody who is | ||
4519 | * freeing space that was never actually used on disk. For example if you | ||
4520 | * reserve some space for a new leaf in transaction A and before transaction A | ||
4521 | * commits you free that leaf, you call this with reserve set to 0 in order to | ||
4522 | * clear the reservation. | ||
4523 | * | ||
4524 | * Metadata reservations should be made with RESERVE_ALLOC so we do the proper | ||
4525 | * ENOSPC accounting. For data we handle the reservation through clearing the | ||
4526 | * delalloc bits in the io_tree. We have to do this since we could end up | ||
4527 | * allocating less disk space for the amount of data we have reserved in the | ||
4528 | * case of compression. | ||
4529 | * | ||
4530 | * If this is a reservation and the block group has become read-only we cannot | ||
4531 | * make the reservation and return -EAGAIN, otherwise this function always | ||
4532 | * succeeds. | ||
4220 | */ | 4533 | */ |
4221 | int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, | 4534 | static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, |
4222 | u64 num_bytes, int reserve, int sinfo) | 4535 | u64 num_bytes, int reserve) |
4223 | { | 4536 | { |
4537 | struct btrfs_space_info *space_info = cache->space_info; | ||
4224 | int ret = 0; | 4538 | int ret = 0; |
4225 | if (sinfo) { | 4539 | spin_lock(&space_info->lock); |
4226 | struct btrfs_space_info *space_info = cache->space_info; | 4540 | spin_lock(&cache->lock); |
4227 | spin_lock(&space_info->lock); | 4541 | if (reserve != RESERVE_FREE) { |
4228 | spin_lock(&cache->lock); | ||
4229 | if (reserve) { | ||
4230 | if (cache->ro) { | ||
4231 | ret = -EAGAIN; | ||
4232 | } else { | ||
4233 | cache->reserved += num_bytes; | ||
4234 | space_info->bytes_reserved += num_bytes; | ||
4235 | } | ||
4236 | } else { | ||
4237 | if (cache->ro) | ||
4238 | space_info->bytes_readonly += num_bytes; | ||
4239 | cache->reserved -= num_bytes; | ||
4240 | space_info->bytes_reserved -= num_bytes; | ||
4241 | space_info->reservation_progress++; | ||
4242 | } | ||
4243 | spin_unlock(&cache->lock); | ||
4244 | spin_unlock(&space_info->lock); | ||
4245 | } else { | ||
4246 | spin_lock(&cache->lock); | ||
4247 | if (cache->ro) { | 4542 | if (cache->ro) { |
4248 | ret = -EAGAIN; | 4543 | ret = -EAGAIN; |
4249 | } else { | 4544 | } else { |
4250 | if (reserve) | 4545 | cache->reserved += num_bytes; |
4251 | cache->reserved += num_bytes; | 4546 | space_info->bytes_reserved += num_bytes; |
4252 | else | 4547 | if (reserve == RESERVE_ALLOC) { |
4253 | cache->reserved -= num_bytes; | 4548 | BUG_ON(space_info->bytes_may_use < num_bytes); |
4549 | space_info->bytes_may_use -= num_bytes; | ||
4550 | } | ||
4254 | } | 4551 | } |
4255 | spin_unlock(&cache->lock); | 4552 | } else { |
4553 | if (cache->ro) | ||
4554 | space_info->bytes_readonly += num_bytes; | ||
4555 | cache->reserved -= num_bytes; | ||
4556 | space_info->bytes_reserved -= num_bytes; | ||
4557 | space_info->reservation_progress++; | ||
4256 | } | 4558 | } |
4559 | spin_unlock(&cache->lock); | ||
4560 | spin_unlock(&space_info->lock); | ||
4257 | return ret; | 4561 | return ret; |
4258 | } | 4562 | } |
4259 | 4563 | ||
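
The three reservation modes move the counters differently; a self-contained toy model makes the transitions visible (the enum values mirror the kernel's reservation enums, the struct is a stand-in for btrfs_space_info, and locking and read-only handling are omitted):

#include <stdio.h>

enum { RESERVE_FREE = 0, RESERVE_ALLOC = 1, RESERVE_ALLOC_NO_ACCOUNT = 2 };

struct toy_space_info {
        unsigned long long bytes_reserved;
        unsigned long long bytes_may_use;
};

static void toy_update_reserved(struct toy_space_info *s,
                                unsigned long long bytes, int reserve)
{
        if (reserve != RESERVE_FREE) {
                s->bytes_reserved += bytes;
                /* Metadata was pre-counted in bytes_may_use; move it so
                 * the ENOSPC math stays honest. Data (NO_ACCOUNT) gets
                 * settled through the delalloc io_tree bits instead. */
                if (reserve == RESERVE_ALLOC)
                        s->bytes_may_use -= bytes;
        } else {
                s->bytes_reserved -= bytes;
        }
}

int main(void)
{
        struct toy_space_info s = { 0, 4096 };

        toy_update_reserved(&s, 4096, RESERVE_ALLOC);
        printf("reserved=%llu may_use=%llu\n", s.bytes_reserved,
               s.bytes_may_use);                        /* 4096 0 */
        toy_update_reserved(&s, 4096, RESERVE_FREE);
        printf("reserved=%llu may_use=%llu\n", s.bytes_reserved,
               s.bytes_may_use);                        /* 0 0 */
        return 0;
}
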
@@ -4319,13 +4623,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) | |||
4319 | spin_lock(&cache->lock); | 4623 | spin_lock(&cache->lock); |
4320 | cache->pinned -= len; | 4624 | cache->pinned -= len; |
4321 | cache->space_info->bytes_pinned -= len; | 4625 | cache->space_info->bytes_pinned -= len; |
4322 | if (cache->ro) { | 4626 | if (cache->ro) |
4323 | cache->space_info->bytes_readonly += len; | 4627 | cache->space_info->bytes_readonly += len; |
4324 | } else if (cache->reserved_pinned > 0) { | ||
4325 | len = min(len, cache->reserved_pinned); | ||
4326 | cache->reserved_pinned -= len; | ||
4327 | cache->space_info->bytes_reserved += len; | ||
4328 | } | ||
4329 | spin_unlock(&cache->lock); | 4628 | spin_unlock(&cache->lock); |
4330 | spin_unlock(&cache->space_info->lock); | 4629 | spin_unlock(&cache->space_info->lock); |
4331 | } | 4630 | } |
@@ -4340,11 +4639,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | |||
4340 | { | 4639 | { |
4341 | struct btrfs_fs_info *fs_info = root->fs_info; | 4640 | struct btrfs_fs_info *fs_info = root->fs_info; |
4342 | struct extent_io_tree *unpin; | 4641 | struct extent_io_tree *unpin; |
4343 | struct btrfs_block_rsv *block_rsv; | ||
4344 | struct btrfs_block_rsv *next_rsv; | ||
4345 | u64 start; | 4642 | u64 start; |
4346 | u64 end; | 4643 | u64 end; |
4347 | int idx; | ||
4348 | int ret; | 4644 | int ret; |
4349 | 4645 | ||
4350 | if (fs_info->pinned_extents == &fs_info->freed_extents[0]) | 4646 | if (fs_info->pinned_extents == &fs_info->freed_extents[0]) |
@@ -4367,30 +4663,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | |||
4367 | cond_resched(); | 4663 | cond_resched(); |
4368 | } | 4664 | } |
4369 | 4665 | ||
4370 | mutex_lock(&fs_info->durable_block_rsv_mutex); | ||
4371 | list_for_each_entry_safe(block_rsv, next_rsv, | ||
4372 | &fs_info->durable_block_rsv_list, list) { | ||
4373 | |||
4374 | idx = trans->transid & 0x1; | ||
4375 | if (block_rsv->freed[idx] > 0) { | ||
4376 | block_rsv_add_bytes(block_rsv, | ||
4377 | block_rsv->freed[idx], 0); | ||
4378 | block_rsv->freed[idx] = 0; | ||
4379 | } | ||
4380 | if (atomic_read(&block_rsv->usage) == 0) { | ||
4381 | btrfs_block_rsv_release(root, block_rsv, (u64)-1); | ||
4382 | |||
4383 | if (block_rsv->freed[0] == 0 && | ||
4384 | block_rsv->freed[1] == 0) { | ||
4385 | list_del_init(&block_rsv->list); | ||
4386 | kfree(block_rsv); | ||
4387 | } | ||
4388 | } else { | ||
4389 | btrfs_block_rsv_release(root, block_rsv, 0); | ||
4390 | } | ||
4391 | } | ||
4392 | mutex_unlock(&fs_info->durable_block_rsv_mutex); | ||
4393 | |||
4394 | return 0; | 4666 | return 0; |
4395 | } | 4667 | } |
4396 | 4668 | ||
@@ -4668,7 +4940,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, | |||
4668 | struct extent_buffer *buf, | 4940 | struct extent_buffer *buf, |
4669 | u64 parent, int last_ref) | 4941 | u64 parent, int last_ref) |
4670 | { | 4942 | { |
4671 | struct btrfs_block_rsv *block_rsv; | ||
4672 | struct btrfs_block_group_cache *cache = NULL; | 4943 | struct btrfs_block_group_cache *cache = NULL; |
4673 | int ret; | 4944 | int ret; |
4674 | 4945 | ||
@@ -4683,64 +4954,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, | |||
4683 | if (!last_ref) | 4954 | if (!last_ref) |
4684 | return; | 4955 | return; |
4685 | 4956 | ||
4686 | block_rsv = get_block_rsv(trans, root); | ||
4687 | cache = btrfs_lookup_block_group(root->fs_info, buf->start); | 4957 | cache = btrfs_lookup_block_group(root->fs_info, buf->start); |
4688 | if (block_rsv->space_info != cache->space_info) | ||
4689 | goto out; | ||
4690 | 4958 | ||
4691 | if (btrfs_header_generation(buf) == trans->transid) { | 4959 | if (btrfs_header_generation(buf) == trans->transid) { |
4692 | if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { | 4960 | if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { |
4693 | ret = check_ref_cleanup(trans, root, buf->start); | 4961 | ret = check_ref_cleanup(trans, root, buf->start); |
4694 | if (!ret) | 4962 | if (!ret) |
4695 | goto pin; | 4963 | goto out; |
4696 | } | 4964 | } |
4697 | 4965 | ||
4698 | if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { | 4966 | if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { |
4699 | pin_down_extent(root, cache, buf->start, buf->len, 1); | 4967 | pin_down_extent(root, cache, buf->start, buf->len, 1); |
4700 | goto pin; | 4968 | goto out; |
4701 | } | 4969 | } |
4702 | 4970 | ||
4703 | WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); | 4971 | WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); |
4704 | 4972 | ||
4705 | btrfs_add_free_space(cache, buf->start, buf->len); | 4973 | btrfs_add_free_space(cache, buf->start, buf->len); |
4706 | ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); | 4974 | btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE); |
4707 | if (ret == -EAGAIN) { | ||
4708 | /* block group became read-only */ | ||
4709 | btrfs_update_reserved_bytes(cache, buf->len, 0, 1); | ||
4710 | goto out; | ||
4711 | } | ||
4712 | |||
4713 | ret = 1; | ||
4714 | spin_lock(&block_rsv->lock); | ||
4715 | if (block_rsv->reserved < block_rsv->size) { | ||
4716 | block_rsv->reserved += buf->len; | ||
4717 | ret = 0; | ||
4718 | } | ||
4719 | spin_unlock(&block_rsv->lock); | ||
4720 | |||
4721 | if (ret) { | ||
4722 | spin_lock(&cache->space_info->lock); | ||
4723 | cache->space_info->bytes_reserved -= buf->len; | ||
4724 | cache->space_info->reservation_progress++; | ||
4725 | spin_unlock(&cache->space_info->lock); | ||
4726 | } | ||
4727 | goto out; | ||
4728 | } | ||
4729 | pin: | ||
4730 | if (block_rsv->durable && !cache->ro) { | ||
4731 | ret = 0; | ||
4732 | spin_lock(&cache->lock); | ||
4733 | if (!cache->ro) { | ||
4734 | cache->reserved_pinned += buf->len; | ||
4735 | ret = 1; | ||
4736 | } | ||
4737 | spin_unlock(&cache->lock); | ||
4738 | |||
4739 | if (ret) { | ||
4740 | spin_lock(&block_rsv->lock); | ||
4741 | block_rsv->freed[trans->transid & 0x1] += buf->len; | ||
4742 | spin_unlock(&block_rsv->lock); | ||
4743 | } | ||
4744 | } | 4975 | } |
4745 | out: | 4976 | out: |
4746 | /* | 4977 | /* |
@@ -4876,17 +5107,20 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, | |||
4876 | struct btrfs_root *root = orig_root->fs_info->extent_root; | 5107 | struct btrfs_root *root = orig_root->fs_info->extent_root; |
4877 | struct btrfs_free_cluster *last_ptr = NULL; | 5108 | struct btrfs_free_cluster *last_ptr = NULL; |
4878 | struct btrfs_block_group_cache *block_group = NULL; | 5109 | struct btrfs_block_group_cache *block_group = NULL; |
5110 | struct btrfs_block_group_cache *used_block_group; | ||
4879 | int empty_cluster = 2 * 1024 * 1024; | 5111 | int empty_cluster = 2 * 1024 * 1024; |
4880 | int allowed_chunk_alloc = 0; | 5112 | int allowed_chunk_alloc = 0; |
4881 | int done_chunk_alloc = 0; | 5113 | int done_chunk_alloc = 0; |
4882 | struct btrfs_space_info *space_info; | 5114 | struct btrfs_space_info *space_info; |
4883 | int last_ptr_loop = 0; | ||
4884 | int loop = 0; | 5115 | int loop = 0; |
4885 | int index = 0; | 5116 | int index = 0; |
5117 | int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? | ||
5118 | RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; | ||
4886 | bool found_uncached_bg = false; | 5119 | bool found_uncached_bg = false; |
4887 | bool failed_cluster_refill = false; | 5120 | bool failed_cluster_refill = false; |
4888 | bool failed_alloc = false; | 5121 | bool failed_alloc = false; |
4889 | bool use_cluster = true; | 5122 | bool use_cluster = true; |
5123 | bool have_caching_bg = false; | ||
4890 | u64 ideal_cache_percent = 0; | 5124 | u64 ideal_cache_percent = 0; |
4891 | u64 ideal_cache_offset = 0; | 5125 | u64 ideal_cache_offset = 0; |
4892 | 5126 | ||
@@ -4939,6 +5173,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, | |||
4939 | ideal_cache: | 5173 | ideal_cache: |
4940 | block_group = btrfs_lookup_block_group(root->fs_info, | 5174 | block_group = btrfs_lookup_block_group(root->fs_info, |
4941 | search_start); | 5175 | search_start); |
5176 | used_block_group = block_group; | ||
4942 | /* | 5177 | /* |
4943 | * we don't want to use the block group if it doesn't match our | 5178 | * we don't want to use the block group if it doesn't match our |
4944 | * allocation bits, or if its not cached. | 5179 | * allocation bits, or if its not cached. |
@@ -4969,12 +5204,14 @@ ideal_cache: | |||
4969 | } | 5204 | } |
4970 | } | 5205 | } |
4971 | search: | 5206 | search: |
5207 | have_caching_bg = false; | ||
4972 | down_read(&space_info->groups_sem); | 5208 | down_read(&space_info->groups_sem); |
4973 | list_for_each_entry(block_group, &space_info->block_groups[index], | 5209 | list_for_each_entry(block_group, &space_info->block_groups[index], |
4974 | list) { | 5210 | list) { |
4975 | u64 offset; | 5211 | u64 offset; |
4976 | int cached; | 5212 | int cached; |
4977 | 5213 | ||
5214 | used_block_group = block_group; | ||
4978 | btrfs_get_block_group(block_group); | 5215 | btrfs_get_block_group(block_group); |
4979 | search_start = block_group->key.objectid; | 5216 | search_start = block_group->key.objectid; |
4980 | 5217 | ||
@@ -4998,13 +5235,15 @@ search: | |||
4998 | } | 5235 | } |
4999 | 5236 | ||
5000 | have_block_group: | 5237 | have_block_group: |
5001 | if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { | 5238 | cached = block_group_cache_done(block_group); |
5239 | if (unlikely(!cached)) { | ||
5002 | u64 free_percent; | 5240 | u64 free_percent; |
5003 | 5241 | ||
5242 | found_uncached_bg = true; | ||
5004 | ret = cache_block_group(block_group, trans, | 5243 | ret = cache_block_group(block_group, trans, |
5005 | orig_root, 1); | 5244 | orig_root, 1); |
5006 | if (block_group->cached == BTRFS_CACHE_FINISHED) | 5245 | if (block_group->cached == BTRFS_CACHE_FINISHED) |
5007 | goto have_block_group; | 5246 | goto alloc; |
5008 | 5247 | ||
5009 | free_percent = btrfs_block_group_used(&block_group->item); | 5248 | free_percent = btrfs_block_group_used(&block_group->item); |
5010 | free_percent *= 100; | 5249 | free_percent *= 100; |
@@ -5026,7 +5265,6 @@ have_block_group: | |||
5026 | orig_root, 0); | 5265 | orig_root, 0); |
5027 | BUG_ON(ret); | 5266 | BUG_ON(ret); |
5028 | } | 5267 | } |
5029 | found_uncached_bg = true; | ||
5030 | 5268 | ||
5031 | /* | 5269 | /* |
5032 | * If loop is set for cached only, try the next block | 5270 | * If loop is set for cached only, try the next block |
@@ -5036,94 +5274,80 @@ have_block_group: | |||
5036 | goto loop; | 5274 | goto loop; |
5037 | } | 5275 | } |
5038 | 5276 | ||
5039 | cached = block_group_cache_done(block_group); | 5277 | alloc: |
5040 | if (unlikely(!cached)) | ||
5041 | found_uncached_bg = true; | ||
5042 | |||
5043 | if (unlikely(block_group->ro)) | 5278 | if (unlikely(block_group->ro)) |
5044 | goto loop; | 5279 | goto loop; |
5045 | 5280 | ||
5046 | spin_lock(&block_group->free_space_ctl->tree_lock); | 5281 | spin_lock(&block_group->free_space_ctl->tree_lock); |
5047 | if (cached && | 5282 | if (cached && |
5048 | block_group->free_space_ctl->free_space < | 5283 | block_group->free_space_ctl->free_space < |
5049 | num_bytes + empty_size) { | 5284 | num_bytes + empty_cluster + empty_size) { |
5050 | spin_unlock(&block_group->free_space_ctl->tree_lock); | 5285 | spin_unlock(&block_group->free_space_ctl->tree_lock); |
5051 | goto loop; | 5286 | goto loop; |
5052 | } | 5287 | } |
5053 | spin_unlock(&block_group->free_space_ctl->tree_lock); | 5288 | spin_unlock(&block_group->free_space_ctl->tree_lock); |
5054 | 5289 | ||
5055 | /* | 5290 | /* |
5056 | * Ok we want to try and use the cluster allocator, so lets look | 5291 | * Ok we want to try and use the cluster allocator, so |
5057 | * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will | 5292 | * let's look there |
5058 | * have tried the cluster allocator plenty of times at this | ||
5059 | * point and not have found anything, so we are likely way too | ||
5060 | * fragmented for the clustering stuff to find anything, so lets | ||
5061 | * just skip it and let the allocator find whatever block it can | ||
5062 | * find | ||
5063 | */ | 5293 | */ |
5064 | if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { | 5294 | if (last_ptr) { |
5065 | /* | 5295 | /* |
5066 | * the refill lock keeps out other | 5296 | * the refill lock keeps out other |
5067 | * people trying to start a new cluster | 5297 | * people trying to start a new cluster |
5068 | */ | 5298 | */ |
5069 | spin_lock(&last_ptr->refill_lock); | 5299 | spin_lock(&last_ptr->refill_lock); |
5070 | if (last_ptr->block_group && | 5300 | used_block_group = last_ptr->block_group; |
5071 | (last_ptr->block_group->ro || | 5301 | if (used_block_group != block_group && |
5072 | !block_group_bits(last_ptr->block_group, data))) { | 5302 | (!used_block_group || |
5073 | offset = 0; | 5303 | used_block_group->ro || |
5304 | !block_group_bits(used_block_group, data))) { | ||
5305 | used_block_group = block_group; | ||
5074 | goto refill_cluster; | 5306 | goto refill_cluster; |
5075 | } | 5307 | } |
5076 | 5308 | ||
5077 | offset = btrfs_alloc_from_cluster(block_group, last_ptr, | 5309 | if (used_block_group != block_group) |
5078 | num_bytes, search_start); | 5310 | btrfs_get_block_group(used_block_group); |
5311 | |||
5312 | offset = btrfs_alloc_from_cluster(used_block_group, | ||
5313 | last_ptr, num_bytes, used_block_group->key.objectid); | ||
5079 | if (offset) { | 5314 | if (offset) { |
5080 | /* we have a block, we're done */ | 5315 | /* we have a block, we're done */ |
5081 | spin_unlock(&last_ptr->refill_lock); | 5316 | spin_unlock(&last_ptr->refill_lock); |
5082 | goto checks; | 5317 | goto checks; |
5083 | } | 5318 | } |
5084 | 5319 | ||
5085 | spin_lock(&last_ptr->lock); | 5320 | WARN_ON(last_ptr->block_group != used_block_group); |
5086 | /* | 5321 | if (used_block_group != block_group) { |
5087 | * whoops, this cluster doesn't actually point to | 5322 | btrfs_put_block_group(used_block_group); |
5088 | * this block group. Get a ref on the block | 5323 | used_block_group = block_group; |
5089 | * group it does point to and try again | ||
5090 | */ | ||
5091 | if (!last_ptr_loop && last_ptr->block_group && | ||
5092 | last_ptr->block_group != block_group && | ||
5093 | index <= | ||
5094 | get_block_group_index(last_ptr->block_group)) { | ||
5095 | |||
5096 | btrfs_put_block_group(block_group); | ||
5097 | block_group = last_ptr->block_group; | ||
5098 | btrfs_get_block_group(block_group); | ||
5099 | spin_unlock(&last_ptr->lock); | ||
5100 | spin_unlock(&last_ptr->refill_lock); | ||
5101 | |||
5102 | last_ptr_loop = 1; | ||
5103 | search_start = block_group->key.objectid; | ||
5104 | /* | ||
5105 | * we know this block group is properly | ||
5106 | * in the list because | ||
5107 | * btrfs_remove_block_group, drops the | ||
5108 | * cluster before it removes the block | ||
5109 | * group from the list | ||
5110 | */ | ||
5111 | goto have_block_group; | ||
5112 | } | 5324 | } |
5113 | spin_unlock(&last_ptr->lock); | ||
5114 | refill_cluster: | 5325 | refill_cluster: |
5326 | BUG_ON(used_block_group != block_group); | ||
5327 | /* If we are on LOOP_NO_EMPTY_SIZE, we can't | ||
5328 | * set up a new cluster, so let's just skip it | ||
5329 | * and let the allocator find whatever block | ||
5330 | * it can find. If we reach this point, we | ||
5331 | * will have tried the cluster allocator | ||
5332 | * plenty of times and not have found | ||
5333 | * anything, so we are likely way too | ||
5334 | * fragmented for the clustering stuff to find | ||
5335 | * anything. */ | ||
5336 | if (loop >= LOOP_NO_EMPTY_SIZE) { | ||
5337 | spin_unlock(&last_ptr->refill_lock); | ||
5338 | goto unclustered_alloc; | ||
5339 | } | ||
5340 | |||
5115 | /* | 5341 | /* |
5116 | * this cluster didn't work out, free it and | 5342 | * this cluster didn't work out, free it and |
5117 | * start over | 5343 | * start over |
5118 | */ | 5344 | */ |
5119 | btrfs_return_cluster_to_free_space(NULL, last_ptr); | 5345 | btrfs_return_cluster_to_free_space(NULL, last_ptr); |
5120 | 5346 | ||
5121 | last_ptr_loop = 0; | ||
5122 | |||
5123 | /* allocate a cluster in this block group */ | 5347 | /* allocate a cluster in this block group */ |
5124 | ret = btrfs_find_space_cluster(trans, root, | 5348 | ret = btrfs_find_space_cluster(trans, root, |
5125 | block_group, last_ptr, | 5349 | block_group, last_ptr, |
5126 | offset, num_bytes, | 5350 | search_start, num_bytes, |
5127 | empty_cluster + empty_size); | 5351 | empty_cluster + empty_size); |
5128 | if (ret == 0) { | 5352 | if (ret == 0) { |
5129 | /* | 5353 | /* |
@@ -5159,6 +5383,7 @@ refill_cluster: | |||
5159 | goto loop; | 5383 | goto loop; |
5160 | } | 5384 | } |
5161 | 5385 | ||
5386 | unclustered_alloc: | ||
5162 | offset = btrfs_find_space_for_alloc(block_group, search_start, | 5387 | offset = btrfs_find_space_for_alloc(block_group, search_start, |
5163 | num_bytes, empty_size); | 5388 | num_bytes, empty_size); |
5164 | /* | 5389 | /* |
@@ -5177,20 +5402,22 @@ refill_cluster: | |||
5177 | failed_alloc = true; | 5402 | failed_alloc = true; |
5178 | goto have_block_group; | 5403 | goto have_block_group; |
5179 | } else if (!offset) { | 5404 | } else if (!offset) { |
5405 | if (!cached) | ||
5406 | have_caching_bg = true; | ||
5180 | goto loop; | 5407 | goto loop; |
5181 | } | 5408 | } |
5182 | checks: | 5409 | checks: |
5183 | search_start = stripe_align(root, offset); | 5410 | search_start = stripe_align(root, offset); |
5184 | /* move on to the next group */ | 5411 | /* move on to the next group */ |
5185 | if (search_start + num_bytes >= search_end) { | 5412 | if (search_start + num_bytes >= search_end) { |
5186 | btrfs_add_free_space(block_group, offset, num_bytes); | 5413 | btrfs_add_free_space(used_block_group, offset, num_bytes); |
5187 | goto loop; | 5414 | goto loop; |
5188 | } | 5415 | } |
5189 | 5416 | ||
5190 | /* move on to the next group */ | 5417 | /* move on to the next group */ |
5191 | if (search_start + num_bytes > | 5418 | if (search_start + num_bytes > |
5192 | block_group->key.objectid + block_group->key.offset) { | 5419 | used_block_group->key.objectid + used_block_group->key.offset) { |
5193 | btrfs_add_free_space(block_group, offset, num_bytes); | 5420 | btrfs_add_free_space(used_block_group, offset, num_bytes); |
5194 | goto loop; | 5421 | goto loop; |
5195 | } | 5422 | } |
5196 | 5423 | ||
@@ -5198,14 +5425,14 @@ checks: | |||
5198 | ins->offset = num_bytes; | 5425 | ins->offset = num_bytes; |
5199 | 5426 | ||
5200 | if (offset < search_start) | 5427 | if (offset < search_start) |
5201 | btrfs_add_free_space(block_group, offset, | 5428 | btrfs_add_free_space(used_block_group, offset, |
5202 | search_start - offset); | 5429 | search_start - offset); |
5203 | BUG_ON(offset > search_start); | 5430 | BUG_ON(offset > search_start); |
5204 | 5431 | ||
5205 | ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, | 5432 | ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, |
5206 | (data & BTRFS_BLOCK_GROUP_DATA)); | 5433 | alloc_type); |
5207 | if (ret == -EAGAIN) { | 5434 | if (ret == -EAGAIN) { |
5208 | btrfs_add_free_space(block_group, offset, num_bytes); | 5435 | btrfs_add_free_space(used_block_group, offset, num_bytes); |
5209 | goto loop; | 5436 | goto loop; |
5210 | } | 5437 | } |
5211 | 5438 | ||
@@ -5214,19 +5441,26 @@ checks: | |||
5214 | ins->offset = num_bytes; | 5441 | ins->offset = num_bytes; |
5215 | 5442 | ||
5216 | if (offset < search_start) | 5443 | if (offset < search_start) |
5217 | btrfs_add_free_space(block_group, offset, | 5444 | btrfs_add_free_space(used_block_group, offset, |
5218 | search_start - offset); | 5445 | search_start - offset); |
5219 | BUG_ON(offset > search_start); | 5446 | BUG_ON(offset > search_start); |
5447 | if (used_block_group != block_group) | ||
5448 | btrfs_put_block_group(used_block_group); | ||
5220 | btrfs_put_block_group(block_group); | 5449 | btrfs_put_block_group(block_group); |
5221 | break; | 5450 | break; |
5222 | loop: | 5451 | loop: |
5223 | failed_cluster_refill = false; | 5452 | failed_cluster_refill = false; |
5224 | failed_alloc = false; | 5453 | failed_alloc = false; |
5225 | BUG_ON(index != get_block_group_index(block_group)); | 5454 | BUG_ON(index != get_block_group_index(block_group)); |
5455 | if (used_block_group != block_group) | ||
5456 | btrfs_put_block_group(used_block_group); | ||
5226 | btrfs_put_block_group(block_group); | 5457 | btrfs_put_block_group(block_group); |
5227 | } | 5458 | } |
5228 | up_read(&space_info->groups_sem); | 5459 | up_read(&space_info->groups_sem); |
5229 | 5460 | ||
5461 | if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) | ||
5462 | goto search; | ||
5463 | |||
5230 | if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) | 5464 | if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) |
5231 | goto search; | 5465 | goto search; |
5232 | 5466 | ||
@@ -5325,7 +5559,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, | |||
5325 | int index = 0; | 5559 | int index = 0; |
5326 | 5560 | ||
5327 | spin_lock(&info->lock); | 5561 | spin_lock(&info->lock); |
5328 | printk(KERN_INFO "space_info has %llu free, is %sfull\n", | 5562 | printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", |
5563 | (unsigned long long)info->flags, | ||
5329 | (unsigned long long)(info->total_bytes - info->bytes_used - | 5564 | (unsigned long long)(info->total_bytes - info->bytes_used - |
5330 | info->bytes_pinned - info->bytes_reserved - | 5565 | info->bytes_pinned - info->bytes_reserved - |
5331 | info->bytes_readonly), | 5566 | info->bytes_readonly), |
@@ -5411,7 +5646,8 @@ again: | |||
5411 | return ret; | 5646 | return ret; |
5412 | } | 5647 | } |
5413 | 5648 | ||
5414 | int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) | 5649 | static int __btrfs_free_reserved_extent(struct btrfs_root *root, |
5650 | u64 start, u64 len, int pin) | ||
5415 | { | 5651 | { |
5416 | struct btrfs_block_group_cache *cache; | 5652 | struct btrfs_block_group_cache *cache; |
5417 | int ret = 0; | 5653 | int ret = 0; |
@@ -5426,8 +5662,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) | |||
5426 | if (btrfs_test_opt(root, DISCARD)) | 5662 | if (btrfs_test_opt(root, DISCARD)) |
5427 | ret = btrfs_discard_extent(root, start, len, NULL); | 5663 | ret = btrfs_discard_extent(root, start, len, NULL); |
5428 | 5664 | ||
5429 | btrfs_add_free_space(cache, start, len); | 5665 | if (pin) |
5430 | btrfs_update_reserved_bytes(cache, len, 0, 1); | 5666 | pin_down_extent(root, cache, start, len, 1); |
5667 | else { | ||
5668 | btrfs_add_free_space(cache, start, len); | ||
5669 | btrfs_update_reserved_bytes(cache, len, RESERVE_FREE); | ||
5670 | } | ||
5431 | btrfs_put_block_group(cache); | 5671 | btrfs_put_block_group(cache); |
5432 | 5672 | ||
5433 | trace_btrfs_reserved_extent_free(root, start, len); | 5673 | trace_btrfs_reserved_extent_free(root, start, len); |
@@ -5435,6 +5675,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) | |||
5435 | return ret; | 5675 | return ret; |
5436 | } | 5676 | } |
5437 | 5677 | ||
5678 | int btrfs_free_reserved_extent(struct btrfs_root *root, | ||
5679 | u64 start, u64 len) | ||
5680 | { | ||
5681 | return __btrfs_free_reserved_extent(root, start, len, 0); | ||
5682 | } | ||
5683 | |||
5684 | int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, | ||
5685 | u64 start, u64 len) | ||
5686 | { | ||
5687 | return __btrfs_free_reserved_extent(root, start, len, 1); | ||
5688 | } | ||
5689 | |||
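
Keeping one static helper behind two thin exported wrappers moves the pin decision to the call site without exposing the flag. A hedged call-site sketch (kernel context; the surrounding situations are illustrative):

/* An extent that was reserved but never made it onto disk can be
 * recycled immediately. */
btrfs_free_reserved_extent(root, start, len);

/* An extent the current transaction may still reference on disk
 * (e.g. a freed log tree block) must stay pinned until commit. */
btrfs_free_and_pin_reserved_extent(root, start, len);
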
5438 | static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | 5690 | static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, |
5439 | struct btrfs_root *root, | 5691 | struct btrfs_root *root, |
5440 | u64 parent, u64 root_objectid, | 5692 | u64 parent, u64 root_objectid, |
@@ -5630,7 +5882,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, | |||
5630 | put_caching_control(caching_ctl); | 5882 | put_caching_control(caching_ctl); |
5631 | } | 5883 | } |
5632 | 5884 | ||
5633 | ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); | 5885 | ret = btrfs_update_reserved_bytes(block_group, ins->offset, |
5886 | RESERVE_ALLOC_NO_ACCOUNT); | ||
5634 | BUG_ON(ret); | 5887 | BUG_ON(ret); |
5635 | btrfs_put_block_group(block_group); | 5888 | btrfs_put_block_group(block_group); |
5636 | ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, | 5889 | ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, |
@@ -5687,8 +5940,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
5687 | block_rsv = get_block_rsv(trans, root); | 5940 | block_rsv = get_block_rsv(trans, root); |
5688 | 5941 | ||
5689 | if (block_rsv->size == 0) { | 5942 | if (block_rsv->size == 0) { |
5690 | ret = reserve_metadata_bytes(trans, root, block_rsv, | 5943 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); |
5691 | blocksize, 0); | ||
5692 | /* | 5944 | /* |
5693 | * If we couldn't reserve metadata bytes try and use some from | 5945 | * If we couldn't reserve metadata bytes try and use some from |
5694 | * the global reserve. | 5946 | * the global reserve. |
@@ -5708,13 +5960,15 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
5708 | if (!ret) | 5960 | if (!ret) |
5709 | return block_rsv; | 5961 | return block_rsv; |
5710 | if (ret) { | 5962 | if (ret) { |
5711 | WARN_ON(1); | 5963 | static DEFINE_RATELIMIT_STATE(_rs, |
5712 | ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, | 5964 | DEFAULT_RATELIMIT_INTERVAL, |
5713 | 0); | 5965 | /*DEFAULT_RATELIMIT_BURST*/ 2); |
5966 | if (__ratelimit(&_rs)) { | ||
5967 | printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret); | ||
5968 | WARN_ON(1); | ||
5969 | } | ||
5970 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0); | ||
5714 | if (!ret) { | 5971 | if (!ret) { |
5715 | spin_lock(&block_rsv->lock); | ||
5716 | block_rsv->size += blocksize; | ||
5717 | spin_unlock(&block_rsv->lock); | ||
5718 | return block_rsv; | 5972 | return block_rsv; |
5719 | } else if (ret && block_rsv != global_rsv) { | 5973 | } else if (ret && block_rsv != global_rsv) { |
5720 | ret = block_rsv_use_bytes(global_rsv, blocksize); | 5974 | ret = block_rsv_use_bytes(global_rsv, blocksize); |
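
The warning above uses the stock ratelimit helpers from <linux/ratelimit.h>. A minimal sketch of the same pattern (kernel context, so it builds only in-tree; the helper name and message are illustrative):

#include <linux/ratelimit.h>
#include <linux/printk.h>

static void warn_rsv_failure(int err)
{
        /* Allow at most two messages per DEFAULT_RATELIMIT_INTERVAL. */
        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, 2);

        if (__ratelimit(&rs))
                printk(KERN_DEBUG "btrfs: block rsv returned %d\n", err);
}
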
@@ -6592,12 +6846,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) | |||
6592 | cache->bytes_super - btrfs_block_group_used(&cache->item); | 6846 | cache->bytes_super - btrfs_block_group_used(&cache->item); |
6593 | 6847 | ||
6594 | if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + | 6848 | if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + |
6595 | sinfo->bytes_may_use + sinfo->bytes_readonly + | 6849 | sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + |
6596 | cache->reserved_pinned + num_bytes + min_allocable_bytes <= | 6850 | min_allocable_bytes <= sinfo->total_bytes) { |
6597 | sinfo->total_bytes) { | ||
6598 | sinfo->bytes_readonly += num_bytes; | 6851 | sinfo->bytes_readonly += num_bytes; |
6599 | sinfo->bytes_reserved += cache->reserved_pinned; | ||
6600 | cache->reserved_pinned = 0; | ||
6601 | cache->ro = 1; | 6852 | cache->ro = 1; |
6602 | ret = 0; | 6853 | ret = 0; |
6603 | } | 6854 | } |
@@ -6964,7 +7215,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) | |||
6964 | struct btrfs_space_info, | 7215 | struct btrfs_space_info, |
6965 | list); | 7216 | list); |
6966 | if (space_info->bytes_pinned > 0 || | 7217 | if (space_info->bytes_pinned > 0 || |
6967 | space_info->bytes_reserved > 0) { | 7218 | space_info->bytes_reserved > 0 || |
7219 | space_info->bytes_may_use > 0) { | ||
6968 | WARN_ON(1); | 7220 | WARN_ON(1); |
6969 | dump_space_info(space_info, 0, 0); | 7221 | dump_space_info(space_info, 0, 0); |
6970 | } | 7222 | } |
@@ -7006,14 +7258,12 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
7006 | return -ENOMEM; | 7258 | return -ENOMEM; |
7007 | path->reada = 1; | 7259 | path->reada = 1; |
7008 | 7260 | ||
7009 | cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); | 7261 | cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); |
7010 | if (cache_gen != 0 && | 7262 | if (btrfs_test_opt(root, SPACE_CACHE) && |
7011 | btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) | 7263 | btrfs_super_generation(root->fs_info->super_copy) != cache_gen) |
7012 | need_clear = 1; | 7264 | need_clear = 1; |
7013 | if (btrfs_test_opt(root, CLEAR_CACHE)) | 7265 | if (btrfs_test_opt(root, CLEAR_CACHE)) |
7014 | need_clear = 1; | 7266 | need_clear = 1; |
7015 | if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) | ||
7016 | printk(KERN_INFO "btrfs: disk space caching is enabled\n"); | ||
7017 | 7267 | ||
7018 | while (1) { | 7268 | while (1) { |
7019 | ret = find_first_block_group(root, path, &key); | 7269 | ret = find_first_block_group(root, path, &key); |
@@ -7252,7 +7502,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
7252 | goto out; | 7502 | goto out; |
7253 | } | 7503 | } |
7254 | 7504 | ||
7255 | inode = lookup_free_space_inode(root, block_group, path); | 7505 | inode = lookup_free_space_inode(tree_root, block_group, path); |
7256 | if (!IS_ERR(inode)) { | 7506 | if (!IS_ERR(inode)) { |
7257 | ret = btrfs_orphan_add(trans, inode); | 7507 | ret = btrfs_orphan_add(trans, inode); |
7258 | BUG_ON(ret); | 7508 | BUG_ON(ret); |
@@ -7268,7 +7518,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
7268 | spin_unlock(&block_group->lock); | 7518 | spin_unlock(&block_group->lock); |
7269 | } | 7519 | } |
7270 | /* One for our lookup ref */ | 7520 | /* One for our lookup ref */ |
7271 | iput(inode); | 7521 | btrfs_add_delayed_iput(inode); |
7272 | } | 7522 | } |
7273 | 7523 | ||
7274 | key.objectid = BTRFS_FREE_SPACE_OBJECTID; | 7524 | key.objectid = BTRFS_FREE_SPACE_OBJECTID; |
@@ -7339,7 +7589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) | |||
7339 | int mixed = 0; | 7589 | int mixed = 0; |
7340 | int ret; | 7590 | int ret; |
7341 | 7591 | ||
7342 | disk_super = &fs_info->super_copy; | 7592 | disk_super = fs_info->super_copy; |
7343 | if (!btrfs_super_root(disk_super)) | 7593 | if (!btrfs_super_root(disk_super)) |
7344 | return 1; | 7594 | return 1; |
7345 | 7595 | ||
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d418164a35f..49f3c9dc09f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include "compat.h" | 17 | #include "compat.h" |
18 | #include "ctree.h" | 18 | #include "ctree.h" |
19 | #include "btrfs_inode.h" | 19 | #include "btrfs_inode.h" |
20 | #include "volumes.h" | ||
20 | 21 | ||
21 | static struct kmem_cache *extent_state_cache; | 22 | static struct kmem_cache *extent_state_cache; |
22 | static struct kmem_cache *extent_buffer_cache; | 23 | static struct kmem_cache *extent_buffer_cache; |
@@ -894,6 +895,202 @@ search_again: | |||
894 | goto again; | 895 | goto again; |
895 | } | 896 | } |
896 | 897 | ||
898 | /** | ||
899 | * convert_extent_bit - convert all bits in a given range from one bit to another | ||
900 | * @tree: the io tree to search | ||
901 | * @start: the start offset in bytes | ||
902 | * @end: the end offset in bytes (inclusive) | ||
903 | * @bits: the bits to set in this range | ||
904 | * @clear_bits: the bits to clear in this range | ||
905 | * @mask: the allocation mask | ||
906 | * | ||
907 | * This will go through and set bits for the given range. If any states exist | ||
908 | * already in this range they are set with the given bit and cleared of the | ||
909 | * clear_bits. This is only meant to be used by things that are mergeable, i.e. | ||
910 | * converting from say DELALLOC to DIRTY. This is not meant to be used with | ||
911 | * boundary bits like LOCK. | ||
912 | */ | ||
913 | int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | ||
914 | int bits, int clear_bits, gfp_t mask) | ||
915 | { | ||
916 | struct extent_state *state; | ||
917 | struct extent_state *prealloc = NULL; | ||
918 | struct rb_node *node; | ||
919 | int err = 0; | ||
920 | u64 last_start; | ||
921 | u64 last_end; | ||
922 | |||
923 | again: | ||
924 | if (!prealloc && (mask & __GFP_WAIT)) { | ||
925 | prealloc = alloc_extent_state(mask); | ||
926 | if (!prealloc) | ||
927 | return -ENOMEM; | ||
928 | } | ||
929 | |||
930 | spin_lock(&tree->lock); | ||
931 | /* | ||
932 | * this search will find all the extents that end after | ||
933 | * our range starts. | ||
934 | */ | ||
935 | node = tree_search(tree, start); | ||
936 | if (!node) { | ||
937 | prealloc = alloc_extent_state_atomic(prealloc); | ||
938 | if (!prealloc) { | ||
939 | err = -ENOMEM; | ||
940 | goto out; | ||
941 | } | ||
942 | err = insert_state(tree, prealloc, start, end, &bits); | ||
943 | prealloc = NULL; | ||
944 | BUG_ON(err == -EEXIST); | ||
945 | goto out; | ||
946 | } | ||
947 | state = rb_entry(node, struct extent_state, rb_node); | ||
948 | hit_next: | ||
949 | last_start = state->start; | ||
950 | last_end = state->end; | ||
951 | |||
952 | /* | ||
953 | * | ---- desired range ---- | | ||
954 | * | state | | ||
955 | * | ||
956 | * Just set the bits on what we found and keep going | ||
957 | */ | ||
958 | if (state->start == start && state->end <= end) { | ||
959 | struct rb_node *next_node; | ||
960 | |||
961 | set_state_bits(tree, state, &bits); | ||
962 | clear_state_bit(tree, state, &clear_bits, 0); | ||
963 | |||
964 | merge_state(tree, state); | ||
965 | if (last_end == (u64)-1) | ||
966 | goto out; | ||
967 | |||
968 | start = last_end + 1; | ||
969 | next_node = rb_next(&state->rb_node); | ||
970 | if (next_node && start < end && prealloc && !need_resched()) { | ||
971 | state = rb_entry(next_node, struct extent_state, | ||
972 | rb_node); | ||
973 | if (state->start == start) | ||
974 | goto hit_next; | ||
975 | } | ||
976 | goto search_again; | ||
977 | } | ||
978 | |||
979 | /* | ||
980 | * | ---- desired range ---- | | ||
981 | * | state | | ||
982 | * or | ||
983 | * | ------------- state -------------- | | ||
984 | * | ||
985 | * We need to split the extent we found, and may flip bits on the | ||
986 | * second half. | ||
987 | * | ||
988 | * If the extent we found extends past our | ||
989 | * range, we just split and search again. It'll get split | ||
990 | * again the next time though. | ||
991 | * | ||
992 | * If the extent we found is inside our range, we set the | ||
993 | * desired bit on it. | ||
994 | */ | ||
995 | if (state->start < start) { | ||
996 | prealloc = alloc_extent_state_atomic(prealloc); | ||
997 | if (!prealloc) { | ||
998 | err = -ENOMEM; | ||
999 | goto out; | ||
1000 | } | ||
1001 | err = split_state(tree, state, prealloc, start); | ||
1002 | BUG_ON(err == -EEXIST); | ||
1003 | prealloc = NULL; | ||
1004 | if (err) | ||
1005 | goto out; | ||
1006 | if (state->end <= end) { | ||
1007 | set_state_bits(tree, state, &bits); | ||
1008 | clear_state_bit(tree, state, &clear_bits, 0); | ||
1009 | merge_state(tree, state); | ||
1010 | if (last_end == (u64)-1) | ||
1011 | goto out; | ||
1012 | start = last_end + 1; | ||
1013 | } | ||
1014 | goto search_again; | ||
1015 | } | ||
1016 | /* | ||
1017 | * | ---- desired range ---- | | ||
1018 | * | state | or | state | | ||
1019 | * | ||
1020 | * There's a hole, we need to insert something in it and | ||
1021 | * ignore the extent we found. | ||
1022 | */ | ||
1023 | if (state->start > start) { | ||
1024 | u64 this_end; | ||
1025 | if (end < last_start) | ||
1026 | this_end = end; | ||
1027 | else | ||
1028 | this_end = last_start - 1; | ||
1029 | |||
1030 | prealloc = alloc_extent_state_atomic(prealloc); | ||
1031 | if (!prealloc) { | ||
1032 | err = -ENOMEM; | ||
1033 | goto out; | ||
1034 | } | ||
1035 | |||
1036 | /* | ||
1037 | * Avoid to free 'prealloc' if it can be merged with | ||
1038 | * the later extent. | ||
1039 | */ | ||
1040 | err = insert_state(tree, prealloc, start, this_end, | ||
1041 | &bits); | ||
1042 | BUG_ON(err == -EEXIST); | ||
1043 | if (err) { | ||
1044 | free_extent_state(prealloc); | ||
1045 | prealloc = NULL; | ||
1046 | goto out; | ||
1047 | } | ||
1048 | prealloc = NULL; | ||
1049 | start = this_end + 1; | ||
1050 | goto search_again; | ||
1051 | } | ||
1052 | /* | ||
1053 | * | ---- desired range ---- | | ||
1054 | * | state | | ||
1055 | * We need to split the extent, and set the bit | ||
1056 | * on the first half | ||
1057 | */ | ||
1058 | if (state->start <= end && state->end > end) { | ||
1059 | prealloc = alloc_extent_state_atomic(prealloc); | ||
1060 | if (!prealloc) { | ||
1061 | err = -ENOMEM; | ||
1062 | goto out; | ||
1063 | } | ||
1064 | |||
1065 | err = split_state(tree, state, prealloc, end + 1); | ||
1066 | BUG_ON(err == -EEXIST); | ||
1067 | |||
1068 | set_state_bits(tree, prealloc, &bits); | ||
1069 | clear_state_bit(tree, prealloc, &clear_bits, 0); | ||
1070 | |||
1071 | merge_state(tree, prealloc); | ||
1072 | prealloc = NULL; | ||
1073 | goto out; | ||
1074 | } | ||
1075 | |||
1076 | goto search_again; | ||
1077 | |||
1078 | out: | ||
1079 | spin_unlock(&tree->lock); | ||
1080 | if (prealloc) | ||
1081 | free_extent_state(prealloc); | ||
1082 | |||
1083 | return err; | ||
1084 | |||
1085 | search_again: | ||
1086 | if (start > end) | ||
1087 | goto out; | ||
1088 | spin_unlock(&tree->lock); | ||
1089 | if (mask & __GFP_WAIT) | ||
1090 | cond_resched(); | ||
1091 | goto again; | ||
1092 | } | ||
1093 | |||
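
Taken together, convert_extent_bit() is a set_extent_bit() and clear_extent_bit() fused into one tree walk. A minimal sketch of the intended calling pattern — the wrapper name and range values are illustrative, not part of the patch:

    /* hypothetical helper: flip a delalloc range to dirty in one pass,
     * avoiding the window where the range carries neither bit */
    static int mark_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
    {
            return convert_extent_bit(tree, start, end,
                                      EXTENT_DIRTY,    /* bits to set */
                                      EXTENT_DELALLOC, /* bits to clear */
                                      GFP_NOFS);
    }

As the kernel-doc above warns, this only makes sense for mergeable bits; a boundary bit like EXTENT_LOCKED would break the merge_state() calls.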
897 | /* wrappers around set/clear extent bit */ | 1094 | /* wrappers around set/clear extent bit */ |
898 | int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | 1095 | int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, |
899 | gfp_t mask) | 1096 | gfp_t mask) |
@@ -919,7 +1116,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, | |||
919 | struct extent_state **cached_state, gfp_t mask) | 1116 | struct extent_state **cached_state, gfp_t mask) |
920 | { | 1117 | { |
921 | return set_extent_bit(tree, start, end, | 1118 | return set_extent_bit(tree, start, end, |
922 | EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, | 1119 | EXTENT_DELALLOC | EXTENT_UPTODATE, |
923 | 0, NULL, cached_state, mask); | 1120 | 0, NULL, cached_state, mask); |
924 | } | 1121 | } |
925 | 1122 | ||
@@ -1599,6 +1796,368 @@ static int check_page_writeback(struct extent_io_tree *tree, | |||
1599 | return 0; | 1796 | return 0; |
1600 | } | 1797 | } |
1601 | 1798 | ||
1799 | /* | ||
1800 | * When IO fails, either with EIO or csum verification fails, we | ||
1801 | * try other mirrors that might have a good copy of the data. This | ||
1802 | * io_failure_record is used to record state as we go through all the | ||
1803 | * mirrors. If another mirror has good data, the page is set up to date | ||
1804 | * and things continue. If a good mirror can't be found, the original | ||
1805 | * bio end_io callback is called to indicate things have failed. | ||
1806 | */ | ||
1807 | struct io_failure_record { | ||
1808 | struct page *page; | ||
1809 | u64 start; | ||
1810 | u64 len; | ||
1811 | u64 logical; | ||
1812 | unsigned long bio_flags; | ||
1813 | int this_mirror; | ||
1814 | int failed_mirror; | ||
1815 | int in_validation; | ||
1816 | }; | ||
1817 | |||
1818 | static int free_io_failure(struct inode *inode, struct io_failure_record *rec, | ||
1819 | int did_repair) | ||
1820 | { | ||
1821 | int ret; | ||
1822 | int err = 0; | ||
1823 | struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; | ||
1824 | |||
1825 | set_state_private(failure_tree, rec->start, 0); | ||
1826 | ret = clear_extent_bits(failure_tree, rec->start, | ||
1827 | rec->start + rec->len - 1, | ||
1828 | EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); | ||
1829 | if (ret) | ||
1830 | err = ret; | ||
1831 | |||
1832 | if (did_repair) { | ||
1833 | ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, | ||
1834 | rec->start + rec->len - 1, | ||
1835 | EXTENT_DAMAGED, GFP_NOFS); | ||
1836 | if (ret && !err) | ||
1837 | err = ret; | ||
1838 | } | ||
1839 | |||
1840 | kfree(rec); | ||
1841 | return err; | ||
1842 | } | ||
1843 | |||
1844 | static void repair_io_failure_callback(struct bio *bio, int err) | ||
1845 | { | ||
1846 | complete(bio->bi_private); | ||
1847 | } | ||
1848 | |||
1849 | /* | ||
1850 | * this bypasses the standard btrfs submit functions deliberately, as | ||
1851 | * the standard behavior is to write all copies in a raid setup. here we only | ||
1852 | * want to write the one bad copy. so we do the mapping for ourselves and issue | ||
1853 | * submit_bio directly. | ||
1854 | * to avoid any synchronization issues, wait for the data after writing, which | ||
1855 | * actually prevents the read that triggered the error from finishing. | ||
1856 | * currently, there can be no more than two copies of every data bit. thus, | ||
1857 | * exactly one rewrite is required. | ||
1858 | */ | ||
1859 | int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, | ||
1860 | u64 length, u64 logical, struct page *page, | ||
1861 | int mirror_num) | ||
1862 | { | ||
1863 | struct bio *bio; | ||
1864 | struct btrfs_device *dev; | ||
1865 | DECLARE_COMPLETION_ONSTACK(compl); | ||
1866 | u64 map_length = 0; | ||
1867 | u64 sector; | ||
1868 | struct btrfs_bio *bbio = NULL; | ||
1869 | int ret; | ||
1870 | |||
1871 | BUG_ON(!mirror_num); | ||
1872 | |||
1873 | bio = bio_alloc(GFP_NOFS, 1); | ||
1874 | if (!bio) | ||
1875 | return -EIO; | ||
1876 | bio->bi_private = &compl; | ||
1877 | bio->bi_end_io = repair_io_failure_callback; | ||
1878 | bio->bi_size = 0; | ||
1879 | map_length = length; | ||
1880 | |||
1881 | ret = btrfs_map_block(map_tree, WRITE, logical, | ||
1882 | &map_length, &bbio, mirror_num); | ||
1883 | if (ret) { | ||
1884 | bio_put(bio); | ||
1885 | return -EIO; | ||
1886 | } | ||
1887 | BUG_ON(mirror_num != bbio->mirror_num); | ||
1888 | sector = bbio->stripes[mirror_num-1].physical >> 9; | ||
1889 | bio->bi_sector = sector; | ||
1890 | dev = bbio->stripes[mirror_num-1].dev; | ||
1891 | kfree(bbio); | ||
1892 | if (!dev || !dev->bdev || !dev->writeable) { | ||
1893 | bio_put(bio); | ||
1894 | return -EIO; | ||
1895 | } | ||
1896 | bio->bi_bdev = dev->bdev; | ||
1897 | bio_add_page(bio, page, length, start-page_offset(page)); | ||
1898 | submit_bio(WRITE_SYNC, bio); | ||
1899 | wait_for_completion(&compl); | ||
1900 | |||
1901 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { | ||
1902 | /* try to remap that extent elsewhere? */ | ||
1903 | bio_put(bio); | ||
1904 | return -EIO; | ||
1905 | } | ||
1906 | |||
1907 | printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s " | ||
1908 | "sector %llu)\n", page->mapping->host->i_ino, start, | ||
1909 | dev->name, sector); | ||
1910 | |||
1911 | bio_put(bio); | ||
1912 | return 0; | ||
1913 | } | ||
1914 | |||
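repair_io_failure() does its own mapping and submits straight to the one bad mirror. Note the unit change on the way into the bio: stripe physical offsets are in bytes, while bio->bi_sector counts 512-byte sectors, hence the `>> 9`. A worked example with made-up numbers:

    u64 physical = 1048576;        /* byte offset within the stripe */
    u64 sector   = physical >> 9;  /* 1048576 / 512 = 2048 sectors */

The WRITE_SYNC plus wait_for_completion() pair then guarantees the corrected copy is durable before the original read is allowed to finish, which is exactly the synchronization the comment above describes.
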
1915 | /* | ||
1916 | * each time an IO finishes, we do a fast check in the IO failure tree | ||
1917 | * to see if we need to process or clean up an io_failure_record | ||
1918 | */ | ||
1919 | static int clean_io_failure(u64 start, struct page *page) | ||
1920 | { | ||
1921 | u64 private; | ||
1922 | u64 private_failure; | ||
1923 | struct io_failure_record *failrec; | ||
1924 | struct btrfs_mapping_tree *map_tree; | ||
1925 | struct extent_state *state; | ||
1926 | int num_copies; | ||
1927 | int did_repair = 0; | ||
1928 | int ret; | ||
1929 | struct inode *inode = page->mapping->host; | ||
1930 | |||
1931 | private = 0; | ||
1932 | ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, | ||
1933 | (u64)-1, 1, EXTENT_DIRTY, 0); | ||
1934 | if (!ret) | ||
1935 | return 0; | ||
1936 | |||
1937 | ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, | ||
1938 | &private_failure); | ||
1939 | if (ret) | ||
1940 | return 0; | ||
1941 | |||
1942 | failrec = (struct io_failure_record *)(unsigned long) private_failure; | ||
1943 | BUG_ON(!failrec->this_mirror); | ||
1944 | |||
1945 | if (failrec->in_validation) { | ||
1946 | /* there was no real error, just free the record */ | ||
1947 | pr_debug("clean_io_failure: freeing dummy error at %llu\n", | ||
1948 | failrec->start); | ||
1949 | did_repair = 1; | ||
1950 | goto out; | ||
1951 | } | ||
1952 | |||
1953 | spin_lock(&BTRFS_I(inode)->io_tree.lock); | ||
1954 | state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, | ||
1955 | failrec->start, | ||
1956 | EXTENT_LOCKED); | ||
1957 | spin_unlock(&BTRFS_I(inode)->io_tree.lock); | ||
1958 | |||
1959 | if (state && state->start == failrec->start) { | ||
1960 | map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; | ||
1961 | num_copies = btrfs_num_copies(map_tree, failrec->logical, | ||
1962 | failrec->len); | ||
1963 | if (num_copies > 1) { | ||
1964 | ret = repair_io_failure(map_tree, start, failrec->len, | ||
1965 | failrec->logical, page, | ||
1966 | failrec->failed_mirror); | ||
1967 | did_repair = !ret; | ||
1968 | } | ||
1969 | } | ||
1970 | |||
1971 | out: | ||
1972 | if (!ret) | ||
1973 | ret = free_io_failure(inode, failrec, did_repair); | ||
1974 | |||
1975 | return ret; | ||
1976 | } | ||
1977 | |||
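clean_io_failure() recovers its record from the failure tree's per-state private value, where the record pointer was stored as a u64. The round trip, condensed from the code in this hunk:

    /* store side (bio_readpage_error): pointer -> u64 private */
    set_state_private(failure_tree, start, (u64)(unsigned long)failrec);

    /* load side (clean_io_failure): u64 private -> pointer */
    ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
                            &private_failure);
    failrec = (struct io_failure_record *)(unsigned long)private_failure;

The cast goes through unsigned long so the code stays correct on 32-bit builds, where a pointer is narrower than a u64.
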
1978 | /* | ||
1979 | * this is a generic handler for readpage errors (default | ||
1980 | * readpage_io_failed_hook). if other copies exist, read those and write back | ||
1981 | * good data to the failed position. it does not attempt to remap the failed | ||
1982 | * extent elsewhere, hoping the device will be smart enough to do this as | ||
1983 | * needed. | ||
1984 | */ | ||
1985 | |||
1986 | static int bio_readpage_error(struct bio *failed_bio, struct page *page, | ||
1987 | u64 start, u64 end, int failed_mirror, | ||
1988 | struct extent_state *state) | ||
1989 | { | ||
1990 | struct io_failure_record *failrec = NULL; | ||
1991 | u64 private; | ||
1992 | struct extent_map *em; | ||
1993 | struct inode *inode = page->mapping->host; | ||
1994 | struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; | ||
1995 | struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; | ||
1996 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
1997 | struct bio *bio; | ||
1998 | int num_copies; | ||
1999 | int ret; | ||
2000 | int read_mode; | ||
2001 | u64 logical; | ||
2002 | |||
2003 | BUG_ON(failed_bio->bi_rw & REQ_WRITE); | ||
2004 | |||
2005 | ret = get_state_private(failure_tree, start, &private); | ||
2006 | if (ret) { | ||
2007 | failrec = kzalloc(sizeof(*failrec), GFP_NOFS); | ||
2008 | if (!failrec) | ||
2009 | return -ENOMEM; | ||
2010 | failrec->start = start; | ||
2011 | failrec->len = end - start + 1; | ||
2012 | failrec->this_mirror = 0; | ||
2013 | failrec->bio_flags = 0; | ||
2014 | failrec->in_validation = 0; | ||
2015 | |||
2016 | read_lock(&em_tree->lock); | ||
2017 | em = lookup_extent_mapping(em_tree, start, failrec->len); | ||
2018 | if (!em) { | ||
2019 | read_unlock(&em_tree->lock); | ||
2020 | kfree(failrec); | ||
2021 | return -EIO; | ||
2022 | } | ||
2023 | |||
2024 | if (em->start > start || em->start + em->len < start) { | ||
2025 | free_extent_map(em); | ||
2026 | em = NULL; | ||
2027 | } | ||
2028 | read_unlock(&em_tree->lock); | ||
2029 | |||
2030 | if (!em || IS_ERR(em)) { | ||
2031 | kfree(failrec); | ||
2032 | return -EIO; | ||
2033 | } | ||
2034 | logical = start - em->start; | ||
2035 | logical = em->block_start + logical; | ||
2036 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { | ||
2037 | logical = em->block_start; | ||
2038 | failrec->bio_flags = EXTENT_BIO_COMPRESSED; | ||
2039 | extent_set_compress_type(&failrec->bio_flags, | ||
2040 | em->compress_type); | ||
2041 | } | ||
2042 | pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " | ||
2043 | "len=%llu\n", logical, start, failrec->len); | ||
2044 | failrec->logical = logical; | ||
2045 | free_extent_map(em); | ||
2046 | |||
2047 | /* set the bits in the private failure tree */ | ||
2048 | ret = set_extent_bits(failure_tree, start, end, | ||
2049 | EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); | ||
2050 | if (ret >= 0) | ||
2051 | ret = set_state_private(failure_tree, start, | ||
2052 | (u64)(unsigned long)failrec); | ||
2053 | /* set the bits in the inode's tree */ | ||
2054 | if (ret >= 0) | ||
2055 | ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, | ||
2056 | GFP_NOFS); | ||
2057 | if (ret < 0) { | ||
2058 | kfree(failrec); | ||
2059 | return ret; | ||
2060 | } | ||
2061 | } else { | ||
2062 | failrec = (struct io_failure_record *)(unsigned long)private; | ||
2063 | pr_debug("bio_readpage_error: (found) logical=%llu, " | ||
2064 | "start=%llu, len=%llu, validation=%d\n", | ||
2065 | failrec->logical, failrec->start, failrec->len, | ||
2066 | failrec->in_validation); | ||
2067 | /* | ||
2068 | * when data can be on disk in more than two copies, add to failrec | ||
2069 | * here (e.g. with a list for failed_mirror) to make | ||
2070 | * clean_io_failure() clean all those errors at once. | ||
2071 | */ | ||
2072 | } | ||
2073 | num_copies = btrfs_num_copies( | ||
2074 | &BTRFS_I(inode)->root->fs_info->mapping_tree, | ||
2075 | failrec->logical, failrec->len); | ||
2076 | if (num_copies == 1) { | ||
2077 | /* | ||
2078 | * we only have a single copy of the data, so don't bother with | ||
2079 | * all the retry and error correction code that follows. no | ||
2080 | * matter what the error is, it is very likely to persist. | ||
2081 | */ | ||
2082 | pr_debug("bio_readpage_error: cannot repair, num_copies == 1. " | ||
2083 | "state=%p, num_copies=%d, next_mirror %d, " | ||
2084 | "failed_mirror %d\n", state, num_copies, | ||
2085 | failrec->this_mirror, failed_mirror); | ||
2086 | free_io_failure(inode, failrec, 0); | ||
2087 | return -EIO; | ||
2088 | } | ||
2089 | |||
2090 | if (!state) { | ||
2091 | spin_lock(&tree->lock); | ||
2092 | state = find_first_extent_bit_state(tree, failrec->start, | ||
2093 | EXTENT_LOCKED); | ||
2094 | if (state && state->start != failrec->start) | ||
2095 | state = NULL; | ||
2096 | spin_unlock(&tree->lock); | ||
2097 | } | ||
2098 | |||
2099 | /* | ||
2100 | * there are two goals here: | ||
2101 | * a) deliver good data to the caller | ||
2102 | * b) correct the bad sectors on disk | ||
2103 | */ | ||
2104 | if (failed_bio->bi_vcnt > 1) { | ||
2105 | /* | ||
2106 | * to fulfill b), we need to know the exact failing sectors, as | ||
2107 | * we don't want to rewrite any more than the failed ones. thus, | ||
2108 | * we need separate read requests for the failed bio | ||
2109 | * | ||
2110 | * if the following BUG_ON triggers, our validation request got | ||
2111 | * merged. we need separate requests for our algorithm to work. | ||
2112 | */ | ||
2113 | BUG_ON(failrec->in_validation); | ||
2114 | failrec->in_validation = 1; | ||
2115 | failrec->this_mirror = failed_mirror; | ||
2116 | read_mode = READ_SYNC | REQ_FAILFAST_DEV; | ||
2117 | } else { | ||
2118 | /* | ||
2119 | * we're ready to fulfill a) and b) at the same time. get a good copy | ||
2120 | * of the failed sector and if we succeed, we have set up | ||
2121 | * everything for repair_io_failure to do the rest for us. | ||
2122 | */ | ||
2123 | if (failrec->in_validation) { | ||
2124 | BUG_ON(failrec->this_mirror != failed_mirror); | ||
2125 | failrec->in_validation = 0; | ||
2126 | failrec->this_mirror = 0; | ||
2127 | } | ||
2128 | failrec->failed_mirror = failed_mirror; | ||
2129 | failrec->this_mirror++; | ||
2130 | if (failrec->this_mirror == failed_mirror) | ||
2131 | failrec->this_mirror++; | ||
2132 | read_mode = READ_SYNC; | ||
2133 | } | ||
2134 | |||
2135 | if (!state || failrec->this_mirror > num_copies) { | ||
2136 | pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, " | ||
2137 | "next_mirror %d, failed_mirror %d\n", state, | ||
2138 | num_copies, failrec->this_mirror, failed_mirror); | ||
2139 | free_io_failure(inode, failrec, 0); | ||
2140 | return -EIO; | ||
2141 | } | ||
2142 | |||
2143 | bio = bio_alloc(GFP_NOFS, 1); | ||
2144 | bio->bi_private = state; | ||
2145 | bio->bi_end_io = failed_bio->bi_end_io; | ||
2146 | bio->bi_sector = failrec->logical >> 9; | ||
2147 | bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; | ||
2148 | bio->bi_size = 0; | ||
2149 | |||
2150 | bio_add_page(bio, page, failrec->len, start - page_offset(page)); | ||
2151 | |||
2152 | pr_debug("bio_readpage_error: submitting new read[%#x] to " | ||
2153 | "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, | ||
2154 | failrec->this_mirror, num_copies, failrec->in_validation); | ||
2155 | |||
2156 | tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, | ||
2157 | failrec->bio_flags, 0); | ||
2158 | return 0; | ||
2159 | } | ||
2160 | |||
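The retry policy in bio_readpage_error() is a simple rotation over the available copies, skipping the mirror that failed. Extracted as a standalone sketch (mirrors are numbered from 1, matching btrfs_map_block(); the helper name is mine, not from the patch):

    /* pick the next mirror to try; 0 means all copies are exhausted */
    static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
    {
            this_mirror++;
            if (this_mirror == failed_mirror)  /* never re-read the bad copy */
                    this_mirror++;
            return (this_mirror > num_copies) ? 0 : this_mirror;
    }

With num_copies == 2 this yields exactly one retry, which lines up with the "exactly one rewrite is required" comment in repair_io_failure().
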
1602 | /* lots and lots of room for performance fixes in the end_bio funcs */ | 2161 | /* lots and lots of room for performance fixes in the end_bio funcs */ |
1603 | 2162 | ||
1604 | /* | 2163 | /* |
@@ -1697,6 +2256,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
1697 | struct extent_state *cached = NULL; | 2256 | struct extent_state *cached = NULL; |
1698 | struct extent_state *state; | 2257 | struct extent_state *state; |
1699 | 2258 | ||
2259 | pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, " | ||
2260 | "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err, | ||
2261 | (long int)bio->bi_bdev); | ||
1700 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 2262 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
1701 | 2263 | ||
1702 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 2264 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + |
@@ -1727,12 +2289,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
1727 | state); | 2289 | state); |
1728 | if (ret) | 2290 | if (ret) |
1729 | uptodate = 0; | 2291 | uptodate = 0; |
2292 | else | ||
2293 | clean_io_failure(start, page); | ||
1730 | } | 2294 | } |
1731 | if (!uptodate && tree->ops && | 2295 | if (!uptodate) { |
1732 | tree->ops->readpage_io_failed_hook) { | 2296 | int failed_mirror; |
1733 | ret = tree->ops->readpage_io_failed_hook(bio, page, | 2297 | failed_mirror = (int)(unsigned long)bio->bi_bdev; |
1734 | start, end, NULL); | 2298 | /* |
2299 | * The generic bio_readpage_error handles errors the | ||
2300 | * following way: If possible, new read requests are | ||
2301 | * created and submitted and will end up in | ||
2302 | * end_bio_extent_readpage as well (if we're lucky, not | ||
2303 | * in the !uptodate case). In that case it returns 0 and | ||
2304 | * we just go on with the next page in our bio. If it | ||
2305 | * can't handle the error it will return -EIO and we | ||
2306 | * remain responsible for that page. | ||
2307 | */ | ||
2308 | ret = bio_readpage_error(bio, page, start, end, | ||
2309 | failed_mirror, NULL); | ||
1735 | if (ret == 0) { | 2310 | if (ret == 0) { |
2311 | error_handled: | ||
1736 | uptodate = | 2312 | uptodate = |
1737 | test_bit(BIO_UPTODATE, &bio->bi_flags); | 2313 | test_bit(BIO_UPTODATE, &bio->bi_flags); |
1738 | if (err) | 2314 | if (err) |
@@ -1740,6 +2316,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
1740 | uncache_state(&cached); | 2316 | uncache_state(&cached); |
1741 | continue; | 2317 | continue; |
1742 | } | 2318 | } |
2319 | if (tree->ops && tree->ops->readpage_io_failed_hook) { | ||
2320 | ret = tree->ops->readpage_io_failed_hook( | ||
2321 | bio, page, start, end, | ||
2322 | failed_mirror, state); | ||
2323 | if (ret == 0) | ||
2324 | goto error_handled; | ||
2325 | } | ||
1743 | } | 2326 | } |
1744 | 2327 | ||
1745 | if (uptodate) { | 2328 | if (uptodate) { |
@@ -1811,6 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, | |||
1811 | mirror_num, bio_flags, start); | 2394 | mirror_num, bio_flags, start); |
1812 | else | 2395 | else |
1813 | submit_bio(rw, bio); | 2396 | submit_bio(rw, bio); |
2397 | |||
1814 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 2398 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
1815 | ret = -EOPNOTSUPP; | 2399 | ret = -EOPNOTSUPP; |
1816 | bio_put(bio); | 2400 | bio_put(bio); |
@@ -2076,16 +2660,16 @@ out: | |||
2076 | } | 2660 | } |
2077 | 2661 | ||
2078 | int extent_read_full_page(struct extent_io_tree *tree, struct page *page, | 2662 | int extent_read_full_page(struct extent_io_tree *tree, struct page *page, |
2079 | get_extent_t *get_extent) | 2663 | get_extent_t *get_extent, int mirror_num) |
2080 | { | 2664 | { |
2081 | struct bio *bio = NULL; | 2665 | struct bio *bio = NULL; |
2082 | unsigned long bio_flags = 0; | 2666 | unsigned long bio_flags = 0; |
2083 | int ret; | 2667 | int ret; |
2084 | 2668 | ||
2085 | ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, | 2669 | ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, |
2086 | &bio_flags); | 2670 | &bio_flags); |
2087 | if (bio) | 2671 | if (bio) |
2088 | ret = submit_one_bio(READ, bio, 0, bio_flags); | 2672 | ret = submit_one_bio(READ, bio, mirror_num, bio_flags); |
2089 | return ret; | 2673 | return ret; |
2090 | } | 2674 | } |
2091 | 2675 | ||
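extent_read_full_page() now forwards a mirror_num to both __extent_read_full_page() and submit_one_bio(), so a caller can force the read onto a specific copy. A hedged usage sketch — the tree/page arguments come from the caller's context, not this hunk:

    /* re-read one page specifically from mirror 2 */
    ret = extent_read_full_page(tree, page, btrfs_get_extent, 2);

    /* mirror_num == 0 keeps the old behavior: btrfs_map_block() picks */
    ret = extent_read_full_page(tree, page, btrfs_get_extent, 0);

This is the plumbing that lets the repair path above re-drive reads through the normal page-read path.
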
@@ -2136,6 +2720,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2136 | int compressed; | 2720 | int compressed; |
2137 | int write_flags; | 2721 | int write_flags; |
2138 | unsigned long nr_written = 0; | 2722 | unsigned long nr_written = 0; |
2723 | bool fill_delalloc = true; | ||
2139 | 2724 | ||
2140 | if (wbc->sync_mode == WB_SYNC_ALL) | 2725 | if (wbc->sync_mode == WB_SYNC_ALL) |
2141 | write_flags = WRITE_SYNC; | 2726 | write_flags = WRITE_SYNC; |
@@ -2145,6 +2730,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2145 | trace___extent_writepage(page, inode, wbc); | 2730 | trace___extent_writepage(page, inode, wbc); |
2146 | 2731 | ||
2147 | WARN_ON(!PageLocked(page)); | 2732 | WARN_ON(!PageLocked(page)); |
2733 | |||
2734 | ClearPageError(page); | ||
2735 | |||
2148 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); | 2736 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); |
2149 | if (page->index > end_index || | 2737 | if (page->index > end_index || |
2150 | (page->index == end_index && !pg_offset)) { | 2738 | (page->index == end_index && !pg_offset)) { |
@@ -2166,10 +2754,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2166 | 2754 | ||
2167 | set_page_extent_mapped(page); | 2755 | set_page_extent_mapped(page); |
2168 | 2756 | ||
2757 | if (!tree->ops || !tree->ops->fill_delalloc) | ||
2758 | fill_delalloc = false; | ||
2759 | |||
2169 | delalloc_start = start; | 2760 | delalloc_start = start; |
2170 | delalloc_end = 0; | 2761 | delalloc_end = 0; |
2171 | page_started = 0; | 2762 | page_started = 0; |
2172 | if (!epd->extent_locked) { | 2763 | if (!epd->extent_locked && fill_delalloc) { |
2173 | u64 delalloc_to_write = 0; | 2764 | u64 delalloc_to_write = 0; |
2174 | /* | 2765 | /* |
2175 | * make sure the wbc mapping index is at least updated | 2766 | * make sure the wbc mapping index is at least updated |
@@ -2421,10 +3012,16 @@ retry: | |||
2421 | * swizzled back from swapper_space to tmpfs file | 3012 | * swizzled back from swapper_space to tmpfs file |
2422 | * mapping | 3013 | * mapping |
2423 | */ | 3014 | */ |
2424 | if (tree->ops && tree->ops->write_cache_pages_lock_hook) | 3015 | if (tree->ops && |
2425 | tree->ops->write_cache_pages_lock_hook(page); | 3016 | tree->ops->write_cache_pages_lock_hook) { |
2426 | else | 3017 | tree->ops->write_cache_pages_lock_hook(page, |
2427 | lock_page(page); | 3018 | data, flush_fn); |
3019 | } else { | ||
3020 | if (!trylock_page(page)) { | ||
3021 | flush_fn(data); | ||
3022 | lock_page(page); | ||
3023 | } | ||
3024 | } | ||
2428 | 3025 | ||
2429 | if (unlikely(page->mapping != mapping)) { | 3026 | if (unlikely(page->mapping != mapping)) { |
2430 | unlock_page(page); | 3027 | unlock_page(page); |
@@ -2790,6 +3387,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
2790 | return -ENOMEM; | 3387 | return -ENOMEM; |
2791 | path->leave_spinning = 1; | 3388 | path->leave_spinning = 1; |
2792 | 3389 | ||
3390 | start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); | ||
3391 | len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); | ||
3392 | |||
2793 | /* | 3393 | /* |
2794 | * lookup the last file extent. We're not using i_size here | 3394 | * lookup the last file extent. We're not using i_size here |
2795 | * because there might be preallocation past i_size | 3395 | * because there might be preallocation past i_size |
@@ -2837,7 +3437,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | |||
2837 | lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, | 3437 | lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, |
2838 | &cached_state, GFP_NOFS); | 3438 | &cached_state, GFP_NOFS); |
2839 | 3439 | ||
2840 | em = get_extent_skip_holes(inode, off, last_for_get_extent, | 3440 | em = get_extent_skip_holes(inode, start, last_for_get_extent, |
2841 | get_extent); | 3441 | get_extent); |
2842 | if (!em) | 3442 | if (!em) |
2843 | goto out; | 3443 | goto out; |
@@ -2926,7 +3526,7 @@ out: | |||
2926 | return ret; | 3526 | return ret; |
2927 | } | 3527 | } |
2928 | 3528 | ||
2929 | static inline struct page *extent_buffer_page(struct extent_buffer *eb, | 3529 | inline struct page *extent_buffer_page(struct extent_buffer *eb, |
2930 | unsigned long i) | 3530 | unsigned long i) |
2931 | { | 3531 | { |
2932 | struct page *p; | 3532 | struct page *p; |
@@ -2951,7 +3551,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, | |||
2951 | return p; | 3551 | return p; |
2952 | } | 3552 | } |
2953 | 3553 | ||
2954 | static inline unsigned long num_extent_pages(u64 start, u64 len) | 3554 | inline unsigned long num_extent_pages(u64 start, u64 len) |
2955 | { | 3555 | { |
2956 | return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - | 3556 | return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - |
2957 | (start >> PAGE_CACHE_SHIFT); | 3557 | (start >> PAGE_CACHE_SHIFT); |
@@ -3204,6 +3804,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, | |||
3204 | PAGECACHE_TAG_DIRTY); | 3804 | PAGECACHE_TAG_DIRTY); |
3205 | } | 3805 | } |
3206 | spin_unlock_irq(&page->mapping->tree_lock); | 3806 | spin_unlock_irq(&page->mapping->tree_lock); |
3807 | ClearPageError(page); | ||
3207 | unlock_page(page); | 3808 | unlock_page(page); |
3208 | } | 3809 | } |
3209 | return 0; | 3810 | return 0; |
@@ -3349,8 +3950,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, | |||
3349 | } | 3950 | } |
3350 | 3951 | ||
3351 | int read_extent_buffer_pages(struct extent_io_tree *tree, | 3952 | int read_extent_buffer_pages(struct extent_io_tree *tree, |
3352 | struct extent_buffer *eb, | 3953 | struct extent_buffer *eb, u64 start, int wait, |
3353 | u64 start, int wait, | ||
3354 | get_extent_t *get_extent, int mirror_num) | 3954 | get_extent_t *get_extent, int mirror_num) |
3355 | { | 3955 | { |
3356 | unsigned long i; | 3956 | unsigned long i; |
@@ -3386,7 +3986,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, | |||
3386 | num_pages = num_extent_pages(eb->start, eb->len); | 3986 | num_pages = num_extent_pages(eb->start, eb->len); |
3387 | for (i = start_i; i < num_pages; i++) { | 3987 | for (i = start_i; i < num_pages; i++) { |
3388 | page = extent_buffer_page(eb, i); | 3988 | page = extent_buffer_page(eb, i); |
3389 | if (!wait) { | 3989 | if (wait == WAIT_NONE) { |
3390 | if (!trylock_page(page)) | 3990 | if (!trylock_page(page)) |
3391 | goto unlock_exit; | 3991 | goto unlock_exit; |
3392 | } else { | 3992 | } else { |
@@ -3430,7 +4030,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, | |||
3430 | if (bio) | 4030 | if (bio) |
3431 | submit_one_bio(READ, bio, mirror_num, bio_flags); | 4031 | submit_one_bio(READ, bio, mirror_num, bio_flags); |
3432 | 4032 | ||
3433 | if (ret || !wait) | 4033 | if (ret || wait != WAIT_COMPLETE) |
3434 | return ret; | 4034 | return ret; |
3435 | 4035 | ||
3436 | for (i = start_i; i < num_pages; i++) { | 4036 | for (i = start_i; i < num_pages; i++) { |
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7b2f0c3e792..7604c300132 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -17,6 +17,8 @@ | |||
17 | #define EXTENT_NODATASUM (1 << 10) | 17 | #define EXTENT_NODATASUM (1 << 10) |
18 | #define EXTENT_DO_ACCOUNTING (1 << 11) | 18 | #define EXTENT_DO_ACCOUNTING (1 << 11) |
19 | #define EXTENT_FIRST_DELALLOC (1 << 12) | 19 | #define EXTENT_FIRST_DELALLOC (1 << 12) |
20 | #define EXTENT_NEED_WAIT (1 << 13) | ||
21 | #define EXTENT_DAMAGED (1 << 14) | ||
20 | #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) | 22 | #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) |
21 | #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) | 23 | #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) |
22 | 24 | ||
@@ -32,6 +34,7 @@ | |||
32 | #define EXTENT_BUFFER_BLOCKING 1 | 34 | #define EXTENT_BUFFER_BLOCKING 1 |
33 | #define EXTENT_BUFFER_DIRTY 2 | 35 | #define EXTENT_BUFFER_DIRTY 2 |
34 | #define EXTENT_BUFFER_CORRUPT 3 | 36 | #define EXTENT_BUFFER_CORRUPT 3 |
37 | #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ | ||
35 | 38 | ||
36 | /* these are flags for extent_clear_unlock_delalloc */ | 39 | /* these are flags for extent_clear_unlock_delalloc */ |
37 | #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 | 40 | #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 |
@@ -67,7 +70,7 @@ struct extent_io_ops { | |||
67 | unsigned long bio_flags); | 70 | unsigned long bio_flags); |
68 | int (*readpage_io_hook)(struct page *page, u64 start, u64 end); | 71 | int (*readpage_io_hook)(struct page *page, u64 start, u64 end); |
69 | int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, | 72 | int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, |
70 | u64 start, u64 end, | 73 | u64 start, u64 end, int failed_mirror, |
71 | struct extent_state *state); | 74 | struct extent_state *state); |
72 | int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, | 75 | int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, |
73 | u64 start, u64 end, | 76 | u64 start, u64 end, |
@@ -85,7 +88,8 @@ struct extent_io_ops { | |||
85 | struct extent_state *other); | 88 | struct extent_state *other); |
86 | void (*split_extent_hook)(struct inode *inode, | 89 | void (*split_extent_hook)(struct inode *inode, |
87 | struct extent_state *orig, u64 split); | 90 | struct extent_state *orig, u64 split); |
88 | int (*write_cache_pages_lock_hook)(struct page *page); | 91 | int (*write_cache_pages_lock_hook)(struct page *page, void *data, |
92 | void (*flush_fn)(void *)); | ||
89 | }; | 93 | }; |
90 | 94 | ||
91 | struct extent_io_tree { | 95 | struct extent_io_tree { |
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, | |||
185 | int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, | 189 | int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, |
186 | gfp_t mask); | 190 | gfp_t mask); |
187 | int extent_read_full_page(struct extent_io_tree *tree, struct page *page, | 191 | int extent_read_full_page(struct extent_io_tree *tree, struct page *page, |
188 | get_extent_t *get_extent); | 192 | get_extent_t *get_extent, int mirror_num); |
189 | int __init extent_io_init(void); | 193 | int __init extent_io_init(void); |
190 | void extent_io_exit(void); | 194 | void extent_io_exit(void); |
191 | 195 | ||
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | |||
214 | gfp_t mask); | 218 | gfp_t mask); |
215 | int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, | 219 | int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, |
216 | gfp_t mask); | 220 | gfp_t mask); |
221 | int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | ||
222 | int bits, int clear_bits, gfp_t mask); | ||
217 | int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, | 223 | int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, |
218 | struct extent_state **cached_state, gfp_t mask); | 224 | struct extent_state **cached_state, gfp_t mask); |
219 | int find_first_extent_bit(struct extent_io_tree *tree, u64 start, | 225 | int find_first_extent_bit(struct extent_io_tree *tree, u64 start, |
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, | |||
248 | struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, | 254 | struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, |
249 | u64 start, unsigned long len); | 255 | u64 start, unsigned long len); |
250 | void free_extent_buffer(struct extent_buffer *eb); | 256 | void free_extent_buffer(struct extent_buffer *eb); |
257 | #define WAIT_NONE 0 | ||
258 | #define WAIT_COMPLETE 1 | ||
259 | #define WAIT_PAGE_LOCK 2 | ||
251 | int read_extent_buffer_pages(struct extent_io_tree *tree, | 260 | int read_extent_buffer_pages(struct extent_io_tree *tree, |
252 | struct extent_buffer *eb, u64 start, int wait, | 261 | struct extent_buffer *eb, u64 start, int wait, |
253 | get_extent_t *get_extent, int mirror_num); | 262 | get_extent_t *get_extent, int mirror_num); |
263 | unsigned long num_extent_pages(u64 start, u64 len); | ||
264 | struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i); | ||
254 | 265 | ||
255 | static inline void extent_buffer_get(struct extent_buffer *eb) | 266 | static inline void extent_buffer_get(struct extent_buffer *eb) |
256 | { | 267 | { |
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode, | |||
300 | struct bio * | 311 | struct bio * |
301 | btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, | 312 | btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, |
302 | gfp_t gfp_flags); | 313 | gfp_t gfp_flags); |
314 | |||
315 | struct btrfs_mapping_tree; | ||
316 | |||
317 | int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start, | ||
318 | u64 length, u64 logical, struct page *page, | ||
319 | int mirror_num); | ||
303 | #endif | 320 | #endif |
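
The tri-state wait argument replaces the old boolean in read_extent_buffer_pages(). Summarizing the semantics visible in this patch (the WAIT_PAGE_LOCK reading is an inference from its name and the new EXTENT_BUFFER_READAHEAD flag, not spelled out here):

    WAIT_NONE      /* trylock pages, kick off reads, return immediately */
    WAIT_COMPLETE  /* lock pages and wait for all reads to finish */
    WAIT_PAGE_LOCK /* presumably: take page locks only, for readahead-style
                      callers that don't need completed I/O */

Old callers passing 0/1 map to WAIT_NONE/WAIT_COMPLETE, which is why `if (!wait)` became `if (wait == WAIT_NONE)` and `if (ret || !wait)` became `if (ret || wait != WAIT_COMPLETE)` in extent_io.c above.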
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a1cb7821bec..c7fb3a4247d 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, | |||
91 | struct btrfs_csum_item *item; | 91 | struct btrfs_csum_item *item; |
92 | struct extent_buffer *leaf; | 92 | struct extent_buffer *leaf; |
93 | u64 csum_offset = 0; | 93 | u64 csum_offset = 0; |
94 | u16 csum_size = | 94 | u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); |
95 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
96 | int csums_in_item; | 95 | int csums_in_item; |
97 | 96 | ||
98 | file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | 97 | file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; |
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, | |||
162 | u64 item_last_offset = 0; | 161 | u64 item_last_offset = 0; |
163 | u64 disk_bytenr; | 162 | u64 disk_bytenr; |
164 | u32 diff; | 163 | u32 diff; |
165 | u16 csum_size = | 164 | u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); |
166 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
167 | int ret; | 165 | int ret; |
168 | struct btrfs_path *path; | 166 | struct btrfs_path *path; |
169 | struct btrfs_csum_item *item = NULL; | 167 | struct btrfs_csum_item *item = NULL; |
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, | |||
290 | int ret; | 288 | int ret; |
291 | size_t size; | 289 | size_t size; |
292 | u64 csum_end; | 290 | u64 csum_end; |
293 | u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); | 291 | u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); |
294 | 292 | ||
295 | path = btrfs_alloc_path(); | 293 | path = btrfs_alloc_path(); |
296 | if (!path) | 294 | if (!path) |
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, | |||
492 | u64 bytenr, u64 len) | 490 | u64 bytenr, u64 len) |
493 | { | 491 | { |
494 | struct extent_buffer *leaf; | 492 | struct extent_buffer *leaf; |
495 | u16 csum_size = | 493 | u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); |
496 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
497 | u64 csum_end; | 494 | u64 csum_end; |
498 | u64 end_byte = bytenr + len; | 495 | u64 end_byte = bytenr + len; |
499 | u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; | 496 | u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; |
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, | |||
549 | u64 csum_end; | 546 | u64 csum_end; |
550 | struct extent_buffer *leaf; | 547 | struct extent_buffer *leaf; |
551 | int ret; | 548 | int ret; |
552 | u16 csum_size = | 549 | u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); |
553 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
554 | int blocksize_bits = root->fs_info->sb->s_blocksize_bits; | 550 | int blocksize_bits = root->fs_info->sb->s_blocksize_bits; |
555 | 551 | ||
556 | root = root->fs_info->csum_root; | 552 | root = root->fs_info->csum_root; |
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, | |||
676 | struct btrfs_sector_sum *sector_sum; | 672 | struct btrfs_sector_sum *sector_sum; |
677 | u32 nritems; | 673 | u32 nritems; |
678 | u32 ins_size; | 674 | u32 ins_size; |
679 | u16 csum_size = | 675 | u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); |
680 | btrfs_super_csum_size(&root->fs_info->super_copy); | ||
681 | 676 | ||
682 | path = btrfs_alloc_path(); | 677 | path = btrfs_alloc_path(); |
683 | if (!path) | 678 | if (!path) |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1266f6e9cdb..dafdfa059bf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, | |||
1069 | int i; | 1069 | int i; |
1070 | unsigned long index = pos >> PAGE_CACHE_SHIFT; | 1070 | unsigned long index = pos >> PAGE_CACHE_SHIFT; |
1071 | struct inode *inode = fdentry(file)->d_inode; | 1071 | struct inode *inode = fdentry(file)->d_inode; |
1072 | gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); | ||
1072 | int err = 0; | 1073 | int err = 0; |
1073 | int faili = 0; | 1074 | int faili = 0; |
1074 | u64 start_pos; | 1075 | u64 start_pos; |
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, | |||
1080 | again: | 1081 | again: |
1081 | for (i = 0; i < num_pages; i++) { | 1082 | for (i = 0; i < num_pages; i++) { |
1082 | pages[i] = find_or_create_page(inode->i_mapping, index + i, | 1083 | pages[i] = find_or_create_page(inode->i_mapping, index + i, |
1083 | GFP_NOFS); | 1084 | mask); |
1084 | if (!pages[i]) { | 1085 | if (!pages[i]) { |
1085 | faili = i - 1; | 1086 | faili = i - 1; |
1086 | err = -ENOMEM; | 1087 | err = -ENOMEM; |
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
1615 | goto out; | 1616 | goto out; |
1616 | } | 1617 | } |
1617 | 1618 | ||
1618 | ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); | ||
1619 | if (ret) | ||
1620 | goto out; | ||
1621 | |||
1622 | locked_end = alloc_end - 1; | 1619 | locked_end = alloc_end - 1; |
1623 | while (1) { | 1620 | while (1) { |
1624 | struct btrfs_ordered_extent *ordered; | 1621 | struct btrfs_ordered_extent *ordered; |
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
1664 | if (em->block_start == EXTENT_MAP_HOLE || | 1661 | if (em->block_start == EXTENT_MAP_HOLE || |
1665 | (cur_offset >= inode->i_size && | 1662 | (cur_offset >= inode->i_size && |
1666 | !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { | 1663 | !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { |
1664 | |||
1665 | /* | ||
1666 | * Make sure we have enough space before we do the | ||
1667 | * allocation. | ||
1668 | */ | ||
1669 | ret = btrfs_check_data_free_space(inode, last_byte - | ||
1670 | cur_offset); | ||
1671 | if (ret) { | ||
1672 | free_extent_map(em); | ||
1673 | break; | ||
1674 | } | ||
1675 | |||
1667 | ret = btrfs_prealloc_file_range(inode, mode, cur_offset, | 1676 | ret = btrfs_prealloc_file_range(inode, mode, cur_offset, |
1668 | last_byte - cur_offset, | 1677 | last_byte - cur_offset, |
1669 | 1 << inode->i_blkbits, | 1678 | 1 << inode->i_blkbits, |
1670 | offset + len, | 1679 | offset + len, |
1671 | &alloc_hint); | 1680 | &alloc_hint); |
1681 | |||
1682 | /* Let go of our reservation. */ | ||
1683 | btrfs_free_reserved_data_space(inode, last_byte - | ||
1684 | cur_offset); | ||
1672 | if (ret < 0) { | 1685 | if (ret < 0) { |
1673 | free_extent_map(em); | 1686 | free_extent_map(em); |
1674 | break; | 1687 | break; |
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode, | |||
1694 | } | 1707 | } |
1695 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, | 1708 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, |
1696 | &cached_state, GFP_NOFS); | 1709 | &cached_state, GFP_NOFS); |
1697 | |||
1698 | btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); | ||
1699 | out: | 1710 | out: |
1700 | mutex_unlock(&inode->i_mutex); | 1711 | mutex_unlock(&inode->i_mutex); |
1701 | return ret; | 1712 | return ret; |
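
The fallocate change replaces one up-front reservation for the whole range with a reserve/free pair around each hole that actually needs preallocation, so already-allocated stretches no longer pin space. The resulting per-iteration shape, condensed from the hunk with error paths trimmed:

    ret = btrfs_check_data_free_space(inode, last_byte - cur_offset);
    if (ret)
            break;                          /* ENOSPC for this chunk only */
    ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
                                    last_byte - cur_offset,
                                    1 << inode->i_blkbits,
                                    offset + len, &alloc_hint);
    btrfs_free_reserved_data_space(inode, last_byte - cur_offset);

The reservation is released immediately — per the "Let go of our reservation" comment — since the preallocated extent carries its own accounting; the check only guards against starting an allocation that cannot fit.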
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 41ac927401d..ec23d43d0c3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
22 | #include <linux/math64.h> | 22 | #include <linux/math64.h> |
23 | #include <linux/ratelimit.h> | ||
23 | #include "ctree.h" | 24 | #include "ctree.h" |
24 | #include "free-space-cache.h" | 25 | #include "free-space-cache.h" |
25 | #include "transaction.h" | 26 | #include "transaction.h" |
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, | |||
84 | *block_group, struct btrfs_path *path) | 85 | *block_group, struct btrfs_path *path) |
85 | { | 86 | { |
86 | struct inode *inode = NULL; | 87 | struct inode *inode = NULL; |
88 | u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; | ||
87 | 89 | ||
88 | spin_lock(&block_group->lock); | 90 | spin_lock(&block_group->lock); |
89 | if (block_group->inode) | 91 | if (block_group->inode) |
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, | |||
98 | return inode; | 100 | return inode; |
99 | 101 | ||
100 | spin_lock(&block_group->lock); | 102 | spin_lock(&block_group->lock); |
101 | if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { | 103 | if (!((BTRFS_I(inode)->flags & flags) == flags)) { |
102 | printk(KERN_INFO "Old style space inode found, converting.\n"); | 104 | printk(KERN_INFO "Old style space inode found, converting.\n"); |
103 | BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; | 105 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | |
106 | BTRFS_INODE_NODATACOW; | ||
104 | block_group->disk_cache_state = BTRFS_DC_CLEAR; | 107 | block_group->disk_cache_state = BTRFS_DC_CLEAR; |
105 | } | 108 | } |
106 | 109 | ||
107 | if (!btrfs_fs_closing(root->fs_info)) { | 110 | if (!block_group->iref) { |
108 | block_group->inode = igrab(inode); | 111 | block_group->inode = igrab(inode); |
109 | block_group->iref = 1; | 112 | block_group->iref = 1; |
110 | } | 113 | } |
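The conversion test in lookup_free_space_inode() tightened from checking NODATASUM alone to requiring both NODATASUM and NODATACOW. The condition is a standard all-bits-set idiom, spelled out:

    u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;

    /* (x & flags) == flags  <=>  every bit in flags is set in x;
     * the negation fires when at least one is missing */
    if (!((BTRFS_I(inode)->flags & flags) == flags)) {
            /* old-style inode: set both flags, force a cache rebuild */
    }

Either missing flag marks the space cache for a rebuild via BTRFS_DC_CLEAR.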
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root, | |||
122 | struct btrfs_free_space_header *header; | 125 | struct btrfs_free_space_header *header; |
123 | struct btrfs_inode_item *inode_item; | 126 | struct btrfs_inode_item *inode_item; |
124 | struct extent_buffer *leaf; | 127 | struct extent_buffer *leaf; |
128 | u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; | ||
125 | int ret; | 129 | int ret; |
126 | 130 | ||
127 | ret = btrfs_insert_empty_inode(trans, root, path, ino); | 131 | ret = btrfs_insert_empty_inode(trans, root, path, ino); |
128 | if (ret) | 132 | if (ret) |
129 | return ret; | 133 | return ret; |
130 | 134 | ||
135 | /* We inline crcs for the free disk space cache */ | ||
136 | if (ino != BTRFS_FREE_INO_OBJECTID) | ||
137 | flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; | ||
138 | |||
131 | leaf = path->nodes[0]; | 139 | leaf = path->nodes[0]; |
132 | inode_item = btrfs_item_ptr(leaf, path->slots[0], | 140 | inode_item = btrfs_item_ptr(leaf, path->slots[0], |
133 | struct btrfs_inode_item); | 141 | struct btrfs_inode_item); |
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root, | |||
140 | btrfs_set_inode_uid(leaf, inode_item, 0); | 148 | btrfs_set_inode_uid(leaf, inode_item, 0); |
141 | btrfs_set_inode_gid(leaf, inode_item, 0); | 149 | btrfs_set_inode_gid(leaf, inode_item, 0); |
142 | btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); | 150 | btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); |
143 | btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | | 151 | btrfs_set_inode_flags(leaf, inode_item, flags); |
144 | BTRFS_INODE_PREALLOC); | ||
145 | btrfs_set_inode_nlink(leaf, inode_item, 1); | 152 | btrfs_set_inode_nlink(leaf, inode_item, 1); |
146 | btrfs_set_inode_transid(leaf, inode_item, trans->transid); | 153 | btrfs_set_inode_transid(leaf, inode_item, trans->transid); |
147 | btrfs_set_inode_block_group(leaf, inode_item, offset); | 154 | btrfs_set_inode_block_group(leaf, inode_item, offset); |
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, | |||
191 | struct inode *inode) | 198 | struct inode *inode) |
192 | { | 199 | { |
193 | struct btrfs_block_rsv *rsv; | 200 | struct btrfs_block_rsv *rsv; |
201 | u64 needed_bytes; | ||
194 | loff_t oldsize; | 202 | loff_t oldsize; |
195 | int ret = 0; | 203 | int ret = 0; |
196 | 204 | ||
197 | rsv = trans->block_rsv; | 205 | rsv = trans->block_rsv; |
198 | trans->block_rsv = root->orphan_block_rsv; | 206 | trans->block_rsv = &root->fs_info->global_block_rsv; |
199 | ret = btrfs_block_rsv_check(trans, root, | 207 | |
200 | root->orphan_block_rsv, | 208 | /* 1 for slack space, 1 for updating the inode */ |
201 | 0, 5); | 209 | needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + |
202 | if (ret) | 210 | btrfs_calc_trans_metadata_size(root, 1); |
203 | return ret; | 211 | |
212 | spin_lock(&trans->block_rsv->lock); | ||
213 | if (trans->block_rsv->reserved < needed_bytes) { | ||
214 | spin_unlock(&trans->block_rsv->lock); | ||
215 | trans->block_rsv = rsv; | ||
216 | return -ENOSPC; | ||
217 | } | ||
218 | spin_unlock(&trans->block_rsv->lock); | ||
204 | 219 | ||
205 | oldsize = i_size_read(inode); | 220 | oldsize = i_size_read(inode); |
206 | btrfs_i_size_write(inode, 0); | 221 | btrfs_i_size_write(inode, 0); |
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, | |||
213 | ret = btrfs_truncate_inode_items(trans, root, inode, | 228 | ret = btrfs_truncate_inode_items(trans, root, inode, |
214 | 0, BTRFS_EXTENT_DATA_KEY); | 229 | 0, BTRFS_EXTENT_DATA_KEY); |
215 | 230 | ||
216 | trans->block_rsv = rsv; | ||
217 | if (ret) { | 231 | if (ret) { |
232 | trans->block_rsv = rsv; | ||
218 | WARN_ON(1); | 233 | WARN_ON(1); |
219 | return ret; | 234 | return ret; |
220 | } | 235 | } |
221 | 236 | ||
222 | ret = btrfs_update_inode(trans, root, inode); | 237 | ret = btrfs_update_inode(trans, root, inode); |
238 | trans->block_rsv = rsv; | ||
239 | |||
223 | return ret; | 240 | return ret; |
224 | } | 241 | } |
225 | 242 | ||
@@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode) | |||
242 | return 0; | 259 | return 0; |
243 | } | 260 | } |
244 | 261 | ||
262 | struct io_ctl { | ||
263 | void *cur, *orig; | ||
264 | struct page *page; | ||
265 | struct page **pages; | ||
266 | struct btrfs_root *root; | ||
267 | unsigned long size; | ||
268 | int index; | ||
269 | int num_pages; | ||
270 | unsigned check_crcs:1; | ||
271 | }; | ||
272 | |||
273 | static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, | ||
274 | struct btrfs_root *root) | ||
275 | { | ||
276 | memset(io_ctl, 0, sizeof(struct io_ctl)); | ||
277 | io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> | ||
278 | PAGE_CACHE_SHIFT; | ||
279 | io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages, | ||
280 | GFP_NOFS); | ||
281 | if (!io_ctl->pages) | ||
282 | return -ENOMEM; | ||
283 | io_ctl->root = root; | ||
284 | if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) | ||
285 | io_ctl->check_crcs = 1; | ||
286 | return 0; | ||
287 | } | ||
288 | |||
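io_ctl is a cursor over the cache file's pages: one kmapped page at a time, advanced by io_ctl_map_page()/io_ctl_unmap_page(). A condensed write-side lifecycle assembled from the helpers in this hunk — the ordering is inferred from their bodies, and does not appear as a single function in the patch:

    struct io_ctl io_ctl;

    if (io_ctl_init(&io_ctl, inode, root))           /* alloc page array */
            return -ENOMEM;
    if (io_ctl_prepare_pages(&io_ctl, inode, 0))     /* pin cache pages */
            goto out;
    io_ctl_set_generation(&io_ctl, generation);      /* header + crc slots */
    /* ... io_ctl_add_entry() / io_ctl_add_bitmap() per free-space entry ... */
    io_ctl_zero_remaining_pages(&io_ctl);            /* pad + crc the tail */
    io_ctl_drop_pages(&io_ctl);                      /* unlock + release */
    out:
    io_ctl_free(&io_ctl);                            /* free page array */

Calling io_ctl_prepare_pages() with uptodate=1 is the read-side variant, pulling existing pages through btrfs_readpage() first.
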
289 | static void io_ctl_free(struct io_ctl *io_ctl) | ||
290 | { | ||
291 | kfree(io_ctl->pages); | ||
292 | } | ||
293 | |||
294 | static void io_ctl_unmap_page(struct io_ctl *io_ctl) | ||
295 | { | ||
296 | if (io_ctl->cur) { | ||
297 | kunmap(io_ctl->page); | ||
298 | io_ctl->cur = NULL; | ||
299 | io_ctl->orig = NULL; | ||
300 | } | ||
301 | } | ||
302 | |||
303 | static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) | ||
304 | { | ||
305 | WARN_ON(io_ctl->cur); | ||
306 | BUG_ON(io_ctl->index >= io_ctl->num_pages); | ||
307 | io_ctl->page = io_ctl->pages[io_ctl->index++]; | ||
308 | io_ctl->cur = kmap(io_ctl->page); | ||
309 | io_ctl->orig = io_ctl->cur; | ||
310 | io_ctl->size = PAGE_CACHE_SIZE; | ||
311 | if (clear) | ||
312 | memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); | ||
313 | } | ||
314 | |||
315 | static void io_ctl_drop_pages(struct io_ctl *io_ctl) | ||
316 | { | ||
317 | int i; | ||
318 | |||
319 | io_ctl_unmap_page(io_ctl); | ||
320 | |||
321 | for (i = 0; i < io_ctl->num_pages; i++) { | ||
322 | ClearPageChecked(io_ctl->pages[i]); | ||
323 | unlock_page(io_ctl->pages[i]); | ||
324 | page_cache_release(io_ctl->pages[i]); | ||
325 | } | ||
326 | } | ||
327 | |||
328 | static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, | ||
329 | int uptodate) | ||
330 | { | ||
331 | struct page *page; | ||
332 | gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); | ||
333 | int i; | ||
334 | |||
335 | for (i = 0; i < io_ctl->num_pages; i++) { | ||
336 | page = find_or_create_page(inode->i_mapping, i, mask); | ||
337 | if (!page) { | ||
338 | io_ctl_drop_pages(io_ctl); | ||
339 | return -ENOMEM; | ||
340 | } | ||
341 | io_ctl->pages[i] = page; | ||
342 | if (uptodate && !PageUptodate(page)) { | ||
343 | btrfs_readpage(NULL, page); | ||
344 | lock_page(page); | ||
345 | if (!PageUptodate(page)) { | ||
346 | printk(KERN_ERR "btrfs: error reading free " | ||
347 | "space cache\n"); | ||
348 | io_ctl_drop_pages(io_ctl); | ||
349 | return -EIO; | ||
350 | } | ||
351 | } | ||
352 | } | ||
353 | |||
354 | for (i = 0; i < io_ctl->num_pages; i++) { | ||
355 | clear_page_dirty_for_io(io_ctl->pages[i]); | ||
356 | set_page_extent_mapped(io_ctl->pages[i]); | ||
357 | } | ||
358 | |||
359 | return 0; | ||
360 | } | ||
361 | |||
362 | static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) | ||
363 | { | ||
364 | u64 *val; | ||
365 | |||
366 | io_ctl_map_page(io_ctl, 1); | ||
367 | |||
368 | /* | ||
369 | * Skip the csum areas. If we don't check crcs then we just have a | ||
370 | * 64bit chunk at the front of the first page. | ||
371 | */ | ||
372 | if (io_ctl->check_crcs) { | ||
373 | io_ctl->cur += (sizeof(u32) * io_ctl->num_pages); | ||
374 | io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages); | ||
375 | } else { | ||
376 | io_ctl->cur += sizeof(u64); | ||
377 | io_ctl->size -= sizeof(u64) * 2; | ||
378 | } | ||
379 | |||
380 | val = io_ctl->cur; | ||
381 | *val = cpu_to_le64(generation); | ||
382 | io_ctl->cur += sizeof(u64); | ||
383 | } | ||
384 | |||
385 | static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) | ||
386 | { | ||
387 | u64 *gen; | ||
388 | |||
389 | /* | ||
390 | * Skip the crc area. If we don't check crcs then we just have a 64bit | ||
391 | * chunk at the front of the first page. | ||
392 | */ | ||
393 | if (io_ctl->check_crcs) { | ||
394 | io_ctl->cur += sizeof(u32) * io_ctl->num_pages; | ||
395 | io_ctl->size -= sizeof(u64) + | ||
396 | (sizeof(u32) * io_ctl->num_pages); | ||
397 | } else { | ||
398 | io_ctl->cur += sizeof(u64); | ||
399 | io_ctl->size -= sizeof(u64) * 2; | ||
400 | } | ||
401 | |||
402 | gen = io_ctl->cur; | ||
403 | if (le64_to_cpu(*gen) != generation) { | ||
404 | printk_ratelimited(KERN_ERR "btrfs: space cache generation " | ||
405 | "(%Lu) does not match inode (%Lu)\n", *gen, | ||
406 | generation); | ||
407 | io_ctl_unmap_page(io_ctl); | ||
408 | return -EIO; | ||
409 | } | ||
410 | io_ctl->cur += sizeof(u64); | ||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) | ||
415 | { | ||
416 | u32 *tmp; | ||
417 | u32 crc = ~(u32)0; | ||
418 | unsigned offset = 0; | ||
419 | |||
420 | if (!io_ctl->check_crcs) { | ||
421 | io_ctl_unmap_page(io_ctl); | ||
422 | return; | ||
423 | } | ||
424 | |||
425 | if (index == 0) | ||
426 | offset = sizeof(u32) * io_ctl->num_pages; | ||
427 | |||
428 | crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, | ||
429 | PAGE_CACHE_SIZE - offset); | ||
430 | btrfs_csum_final(crc, (char *)&crc); | ||
431 | io_ctl_unmap_page(io_ctl); | ||
432 | tmp = kmap(io_ctl->pages[0]); | ||
433 | tmp += index; | ||
434 | *tmp = crc; | ||
435 | kunmap(io_ctl->pages[0]); | ||
436 | } | ||
437 | |||
438 | static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) | ||
439 | { | ||
440 | u32 *tmp, val; | ||
441 | u32 crc = ~(u32)0; | ||
442 | unsigned offset = 0; | ||
443 | |||
444 | if (!io_ctl->check_crcs) { | ||
445 | io_ctl_map_page(io_ctl, 0); | ||
446 | return 0; | ||
447 | } | ||
448 | |||
449 | if (index == 0) | ||
450 | offset = sizeof(u32) * io_ctl->num_pages; | ||
451 | |||
452 | tmp = kmap(io_ctl->pages[0]); | ||
453 | tmp += index; | ||
454 | val = *tmp; | ||
455 | kunmap(io_ctl->pages[0]); | ||
456 | |||
457 | io_ctl_map_page(io_ctl, 0); | ||
458 | crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc, | ||
459 | PAGE_CACHE_SIZE - offset); | ||
460 | btrfs_csum_final(crc, (char *)&crc); | ||
461 | if (val != crc) { | ||
462 | printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free " | ||
463 | "space cache\n"); | ||
464 | io_ctl_unmap_page(io_ctl); | ||
465 | return -EIO; | ||
466 | } | ||
467 | |||
468 | return 0; | ||
469 | } | ||
470 | |||
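The inline checksums sit as an array of u32, one per page, at the start of page 0; consequently page 0's own crc covers only the bytes after that array, while every later page is crc'd whole. Layout, for a cache file of N pages:

    page 0: [ crc[0] .. crc[N-1] | generation | entries ... ]
             \___ skipped by crc[0] ___/
    page k (k >= 1): [ entries or one bitmap ... ]   /* crc[k] covers it all */

That is why io_ctl_set_crc() and io_ctl_check_crc() apply `offset = sizeof(u32) * num_pages` only for index == 0, and why io_ctl_set_generation() skips the same region before writing the generation stamp.
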
471 | static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, | ||
472 | void *bitmap) | ||
473 | { | ||
474 | struct btrfs_free_space_entry *entry; | ||
475 | |||
476 | if (!io_ctl->cur) | ||
477 | return -ENOSPC; | ||
478 | |||
479 | entry = io_ctl->cur; | ||
480 | entry->offset = cpu_to_le64(offset); | ||
481 | entry->bytes = cpu_to_le64(bytes); | ||
482 | entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : | ||
483 | BTRFS_FREE_SPACE_EXTENT; | ||
484 | io_ctl->cur += sizeof(struct btrfs_free_space_entry); | ||
485 | io_ctl->size -= sizeof(struct btrfs_free_space_entry); | ||
486 | |||
487 | if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) | ||
488 | return 0; | ||
489 | |||
490 | io_ctl_set_crc(io_ctl, io_ctl->index - 1); | ||
491 | |||
492 | /* No more pages to map */ | ||
493 | if (io_ctl->index >= io_ctl->num_pages) | ||
494 | return 0; | ||
495 | |||
496 | /* map the next page */ | ||
497 | io_ctl_map_page(io_ctl, 1); | ||
498 | return 0; | ||
499 | } | ||
500 | |||
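io_ctl_add_entry() packs fixed-size records until less than one record's worth of space remains, then crcs the full page and maps the next. Assuming the packed on-disk entry (two __le64 plus a u8 type, 17 bytes) and 4 KiB pages, the capacity works out roughly as:

	/* later pages: 4096 / 17 = 240 entries each; the first page
	 * additionally loses the crc table and generation (worked
	 * example only, sizes depend on PAGE_CACHE_SIZE and num_pages) */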
501 | static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) | ||
502 | { | ||
503 | if (!io_ctl->cur) | ||
504 | return -ENOSPC; | ||
505 | |||
506 | /* | ||
507 | * If we aren't at the start of the current page, unmap this one and | ||
508 | * map the next one if there is any left. | ||
509 | */ | ||
510 | if (io_ctl->cur != io_ctl->orig) { | ||
511 | io_ctl_set_crc(io_ctl, io_ctl->index - 1); | ||
512 | if (io_ctl->index >= io_ctl->num_pages) | ||
513 | return -ENOSPC; | ||
514 | io_ctl_map_page(io_ctl, 0); | ||
515 | } | ||
516 | |||
517 | memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE); | ||
518 | io_ctl_set_crc(io_ctl, io_ctl->index - 1); | ||
519 | if (io_ctl->index < io_ctl->num_pages) | ||
520 | io_ctl_map_page(io_ctl, 0); | ||
521 | return 0; | ||
522 | } | ||
523 | |||
524 | static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) | ||
525 | { | ||
526 | /* | ||
527 | * If we're not on the boundary we know we've modified the page and we | ||
528 | * need to crc the page. | ||
529 | */ | ||
530 | if (io_ctl->cur != io_ctl->orig) | ||
531 | io_ctl_set_crc(io_ctl, io_ctl->index - 1); | ||
532 | else | ||
533 | io_ctl_unmap_page(io_ctl); | ||
534 | |||
535 | while (io_ctl->index < io_ctl->num_pages) { | ||
536 | io_ctl_map_page(io_ctl, 1); | ||
537 | io_ctl_set_crc(io_ctl, io_ctl->index - 1); | ||
538 | } | ||
539 | } | ||
540 | |||
541 | static int io_ctl_read_entry(struct io_ctl *io_ctl, | ||
542 | struct btrfs_free_space *entry, u8 *type) | ||
543 | { | ||
544 | struct btrfs_free_space_entry *e; | ||
545 | int ret; | ||
546 | |||
547 | if (!io_ctl->cur) { | ||
548 | ret = io_ctl_check_crc(io_ctl, io_ctl->index); | ||
549 | if (ret) | ||
550 | return ret; | ||
551 | } | ||
552 | |||
553 | e = io_ctl->cur; | ||
554 | entry->offset = le64_to_cpu(e->offset); | ||
555 | entry->bytes = le64_to_cpu(e->bytes); | ||
556 | *type = e->type; | ||
557 | io_ctl->cur += sizeof(struct btrfs_free_space_entry); | ||
558 | io_ctl->size -= sizeof(struct btrfs_free_space_entry); | ||
559 | |||
560 | if (io_ctl->size >= sizeof(struct btrfs_free_space_entry)) | ||
561 | return 0; | ||
562 | |||
563 | io_ctl_unmap_page(io_ctl); | ||
564 | |||
565 | return 0; | ||
566 | } | ||
567 | |||
568 | static int io_ctl_read_bitmap(struct io_ctl *io_ctl, | ||
569 | struct btrfs_free_space *entry) | ||
570 | { | ||
571 | int ret; | ||
572 | |||
573 | ret = io_ctl_check_crc(io_ctl, io_ctl->index); | ||
574 | if (ret) | ||
575 | return ret; | ||
576 | |||
577 | memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE); | ||
578 | io_ctl_unmap_page(io_ctl); | ||
579 | |||
580 | return 0; | ||
581 | } | ||
582 | |||
245 | int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, | 583 | int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, |
246 | struct btrfs_free_space_ctl *ctl, | 584 | struct btrfs_free_space_ctl *ctl, |
247 | struct btrfs_path *path, u64 offset) | 585 | struct btrfs_path *path, u64 offset) |
248 | { | 586 | { |
249 | struct btrfs_free_space_header *header; | 587 | struct btrfs_free_space_header *header; |
250 | struct extent_buffer *leaf; | 588 | struct extent_buffer *leaf; |
251 | struct page *page; | 589 | struct io_ctl io_ctl; |
252 | struct btrfs_key key; | 590 | struct btrfs_key key; |
591 | struct btrfs_free_space *e, *n; | ||
253 | struct list_head bitmaps; | 592 | struct list_head bitmaps; |
254 | u64 num_entries; | 593 | u64 num_entries; |
255 | u64 num_bitmaps; | 594 | u64 num_bitmaps; |
256 | u64 generation; | 595 | u64 generation; |
257 | pgoff_t index = 0; | 596 | u8 type; |
258 | int ret = 0; | 597 | int ret = 0; |
259 | 598 | ||
260 | INIT_LIST_HEAD(&bitmaps); | 599 | INIT_LIST_HEAD(&bitmaps); |
261 | 600 | ||
262 | /* Nothing in the space cache, goodbye */ | 601 | /* Nothing in the space cache, goodbye */ |
263 | if (!i_size_read(inode)) | 602 | if (!i_size_read(inode)) |
264 | goto out; | 603 | return 0; |
265 | 604 | ||
266 | key.objectid = BTRFS_FREE_SPACE_OBJECTID; | 605 | key.objectid = BTRFS_FREE_SPACE_OBJECTID; |
267 | key.offset = offset; | 606 | key.offset = offset; |
@@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, | |||
269 | 608 | ||
270 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | 609 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); |
271 | if (ret < 0) | 610 | if (ret < 0) |
272 | goto out; | 611 | return 0; |
273 | else if (ret > 0) { | 612 | else if (ret > 0) { |
274 | btrfs_release_path(path); | 613 | btrfs_release_path(path); |
275 | ret = 0; | 614 | return 0; |
276 | goto out; | ||
277 | } | 615 | } |
278 | 616 | ||
279 | ret = -1; | 617 | ret = -1; |
@@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, | |||
291 | " not match free space cache generation (%llu)\n", | 629 | " not match free space cache generation (%llu)\n", |
292 | (unsigned long long)BTRFS_I(inode)->generation, | 630 | (unsigned long long)BTRFS_I(inode)->generation, |
293 | (unsigned long long)generation); | 631 | (unsigned long long)generation); |
294 | goto out; | 632 | return 0; |
295 | } | 633 | } |
296 | 634 | ||
297 | if (!num_entries) | 635 | if (!num_entries) |
298 | goto out; | 636 | return 0; |
299 | 637 | ||
638 | io_ctl_init(&io_ctl, inode, root); | ||
300 | ret = readahead_cache(inode); | 639 | ret = readahead_cache(inode); |
301 | if (ret) | 640 | if (ret) |
302 | goto out; | 641 | goto out; |
303 | 642 | ||
304 | while (1) { | 643 | ret = io_ctl_prepare_pages(&io_ctl, inode, 1); |
305 | struct btrfs_free_space_entry *entry; | 644 | if (ret) |
306 | struct btrfs_free_space *e; | 645 | goto out; |
307 | void *addr; | ||
308 | unsigned long offset = 0; | ||
309 | int need_loop = 0; | ||
310 | 646 | ||
311 | if (!num_entries && !num_bitmaps) | 647 | ret = io_ctl_check_crc(&io_ctl, 0); |
312 | break; | 648 | if (ret) |
649 | goto free_cache; | ||
650 | |||
651 | ret = io_ctl_check_generation(&io_ctl, generation); | ||
652 | if (ret) | ||
653 | goto free_cache; | ||
313 | 654 | ||
314 | page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); | 655 | while (num_entries) { |
315 | if (!page) | 656 | e = kmem_cache_zalloc(btrfs_free_space_cachep, |
657 | GFP_NOFS); | ||
658 | if (!e) | ||
316 | goto free_cache; | 659 | goto free_cache; |
317 | 660 | ||
318 | if (!PageUptodate(page)) { | 661 | ret = io_ctl_read_entry(&io_ctl, e, &type); |
319 | btrfs_readpage(NULL, page); | 662 | if (ret) { |
320 | lock_page(page); | 663 | kmem_cache_free(btrfs_free_space_cachep, e); |
321 | if (!PageUptodate(page)) { | 664 | goto free_cache; |
322 | unlock_page(page); | ||
323 | page_cache_release(page); | ||
324 | printk(KERN_ERR "btrfs: error reading free " | ||
325 | "space cache\n"); | ||
326 | goto free_cache; | ||
327 | } | ||
328 | } | 665 | } |
329 | addr = kmap(page); | ||
330 | 666 | ||
331 | if (index == 0) { | 667 | if (!e->bytes) { |
332 | u64 *gen; | 668 | kmem_cache_free(btrfs_free_space_cachep, e); |
669 | goto free_cache; | ||
670 | } | ||
333 | 671 | ||
334 | /* | 672 | if (type == BTRFS_FREE_SPACE_EXTENT) { |
335 | * We put a bogus crc in the front of the first page in | 673 | spin_lock(&ctl->tree_lock); |
336 | * case old kernels try to mount a fs with the new | 674 | ret = link_free_space(ctl, e); |
337 | * format to make sure they discard the cache. | 675 | spin_unlock(&ctl->tree_lock); |
338 | */ | 676 | if (ret) { |
339 | addr += sizeof(u64); | 677 | printk(KERN_ERR "Duplicate entries in " |
340 | offset += sizeof(u64); | 678 | "free space cache, dumping\n"); |
341 | 679 | kmem_cache_free(btrfs_free_space_cachep, e); | |
342 | gen = addr; | ||
343 | if (*gen != BTRFS_I(inode)->generation) { | ||
344 | printk(KERN_ERR "btrfs: space cache generation" | ||
345 | " (%llu) does not match inode (%llu)\n", | ||
346 | (unsigned long long)*gen, | ||
347 | (unsigned long long) | ||
348 | BTRFS_I(inode)->generation); | ||
349 | kunmap(page); | ||
350 | unlock_page(page); | ||
351 | page_cache_release(page); | ||
352 | goto free_cache; | 680 | goto free_cache; |
353 | } | 681 | } |
354 | addr += sizeof(u64); | 682 | } else { |
355 | offset += sizeof(u64); | 683 | BUG_ON(!num_bitmaps); |
356 | } | 684 | num_bitmaps--; |
357 | entry = addr; | 685 | e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); |
358 | 686 | if (!e->bitmap) { | |
359 | while (1) { | 687 | kmem_cache_free( |
360 | if (!num_entries) | 688 | btrfs_free_space_cachep, e); |
361 | break; | ||
362 | |||
363 | need_loop = 1; | ||
364 | e = kmem_cache_zalloc(btrfs_free_space_cachep, | ||
365 | GFP_NOFS); | ||
366 | if (!e) { | ||
367 | kunmap(page); | ||
368 | unlock_page(page); | ||
369 | page_cache_release(page); | ||
370 | goto free_cache; | 689 | goto free_cache; |
371 | } | 690 | } |
372 | 691 | spin_lock(&ctl->tree_lock); | |
373 | e->offset = le64_to_cpu(entry->offset); | 692 | ret = link_free_space(ctl, e); |
374 | e->bytes = le64_to_cpu(entry->bytes); | 693 | ctl->total_bitmaps++; |
375 | if (!e->bytes) { | 694 | ctl->op->recalc_thresholds(ctl); |
376 | kunmap(page); | 695 | spin_unlock(&ctl->tree_lock); |
696 | if (ret) { | ||
697 | printk(KERN_ERR "Duplicate entries in " | ||
698 | "free space cache, dumping\n"); | ||
377 | kmem_cache_free(btrfs_free_space_cachep, e); | 699 | kmem_cache_free(btrfs_free_space_cachep, e); |
378 | unlock_page(page); | ||
379 | page_cache_release(page); | ||
380 | goto free_cache; | 700 | goto free_cache; |
381 | } | 701 | } |
382 | 702 | list_add_tail(&e->list, &bitmaps); | |
383 | if (entry->type == BTRFS_FREE_SPACE_EXTENT) { | ||
384 | spin_lock(&ctl->tree_lock); | ||
385 | ret = link_free_space(ctl, e); | ||
386 | spin_unlock(&ctl->tree_lock); | ||
387 | if (ret) { | ||
388 | printk(KERN_ERR "Duplicate entries in " | ||
389 | "free space cache, dumping\n"); | ||
390 | kunmap(page); | ||
391 | unlock_page(page); | ||
392 | page_cache_release(page); | ||
393 | goto free_cache; | ||
394 | } | ||
395 | } else { | ||
396 | e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); | ||
397 | if (!e->bitmap) { | ||
398 | kunmap(page); | ||
399 | kmem_cache_free( | ||
400 | btrfs_free_space_cachep, e); | ||
401 | unlock_page(page); | ||
402 | page_cache_release(page); | ||
403 | goto free_cache; | ||
404 | } | ||
405 | spin_lock(&ctl->tree_lock); | ||
406 | ret = link_free_space(ctl, e); | ||
407 | ctl->total_bitmaps++; | ||
408 | ctl->op->recalc_thresholds(ctl); | ||
409 | spin_unlock(&ctl->tree_lock); | ||
410 | if (ret) { | ||
411 | printk(KERN_ERR "Duplicate entries in " | ||
412 | "free space cache, dumping\n"); | ||
413 | kunmap(page); | ||
414 | unlock_page(page); | ||
415 | page_cache_release(page); | ||
416 | goto free_cache; | ||
417 | } | ||
418 | list_add_tail(&e->list, &bitmaps); | ||
419 | } | ||
420 | |||
421 | num_entries--; | ||
422 | offset += sizeof(struct btrfs_free_space_entry); | ||
423 | if (offset + sizeof(struct btrfs_free_space_entry) >= | ||
424 | PAGE_CACHE_SIZE) | ||
425 | break; | ||
426 | entry++; | ||
427 | } | 703 | } |
428 | 704 | ||
429 | /* | 705 | num_entries--; |
430 | * We read an entry out of this page, we need to move on to the | 706 | } |
431 | * next page. | ||
432 | */ | ||
433 | if (need_loop) { | ||
434 | kunmap(page); | ||
435 | goto next; | ||
436 | } | ||
437 | 707 | ||
438 | /* | 708 | io_ctl_unmap_page(&io_ctl); |
439 | * We add the bitmaps at the end of the entries in order that | 709 | |
440 | * the bitmap entries are added to the cache. | 710 | /* |
441 | */ | 711 | * We add the bitmaps at the end of the entries in order that |
442 | e = list_entry(bitmaps.next, struct btrfs_free_space, list); | 712 | * the bitmap entries are added to the cache. |
713 | */ | ||
714 | list_for_each_entry_safe(e, n, &bitmaps, list) { | ||
443 | list_del_init(&e->list); | 715 | list_del_init(&e->list); |
444 | memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); | 716 | ret = io_ctl_read_bitmap(&io_ctl, e); |
445 | kunmap(page); | 717 | if (ret) |
446 | num_bitmaps--; | 718 | goto free_cache; |
447 | next: | ||
448 | unlock_page(page); | ||
449 | page_cache_release(page); | ||
450 | index++; | ||
451 | } | 719 | } |
452 | 720 | ||
721 | io_ctl_drop_pages(&io_ctl); | ||
453 | ret = 1; | 722 | ret = 1; |
454 | out: | 723 | out: |
724 | io_ctl_free(&io_ctl); | ||
455 | return ret; | 725 | return ret; |
456 | free_cache: | 726 | free_cache: |
727 | io_ctl_drop_pages(&io_ctl); | ||
457 | __btrfs_remove_free_space_cache(ctl); | 728 | __btrfs_remove_free_space_cache(ctl); |
458 | goto out; | 729 | goto out; |
459 | } | 730 | } |
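Read top to bottom, the rewritten load path is now one short, linear sequence; as a sketch:

	/*
	 * io_ctl_init() -> readahead_cache() -> io_ctl_prepare_pages()
	 * -> io_ctl_check_crc(0) -> io_ctl_check_generation()
	 * -> io_ctl_read_entry(), num_entries times; every entry is
	 *    linked at once, bitmap entries are also queued on a local list
	 * -> io_ctl_read_bitmap() per queued entry fills in the bitmap
	 *    data (one full page each)
	 * -> io_ctl_drop_pages() -> io_ctl_free()
	 */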
@@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, | |||
465 | struct btrfs_root *root = fs_info->tree_root; | 736 | struct btrfs_root *root = fs_info->tree_root; |
466 | struct inode *inode; | 737 | struct inode *inode; |
467 | struct btrfs_path *path; | 738 | struct btrfs_path *path; |
468 | int ret; | 739 | int ret = 0; |
469 | bool matched; | 740 | bool matched; |
470 | u64 used = btrfs_block_group_used(&block_group->item); | 741 | u64 used = btrfs_block_group_used(&block_group->item); |
471 | 742 | ||
@@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, | |||
497 | return 0; | 768 | return 0; |
498 | } | 769 | } |
499 | 770 | ||
771 | /* We may have converted the inode and made the cache invalid. */ | ||
772 | spin_lock(&block_group->lock); | ||
773 | if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { | ||
774 | spin_unlock(&block_group->lock); | ||
775 | goto out; | ||
776 | } | ||
777 | spin_unlock(&block_group->lock); | ||
778 | |||
500 | ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, | 779 | ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, |
501 | path, block_group->key.objectid); | 780 | path, block_group->key.objectid); |
502 | btrfs_free_path(path); | 781 | btrfs_free_path(path); |
@@ -530,6 +809,19 @@ out: | |||
530 | return ret; | 809 | return ret; |
531 | } | 810 | } |
532 | 811 | ||
812 | /** | ||
813 | * __btrfs_write_out_cache - write out cached info to an inode | ||
814 | * @root: the root the inode belongs to | ||
815 | * @ctl: the free space cache we are going to write out | ||
816 | * @block_group: the block_group for this cache if it belongs to a block_group | ||
817 | * @trans: the trans handle | ||
818 | * @path: the path to use | ||
819 | * @offset: the offset for the key we'll insert | ||
820 | * | ||
821 | * This function writes out a free space cache struct to disk for quick recovery | ||
822 | * on mount. This will return 0 if it was successful in writing the cache out, | ||
823 | * and -1 if it was not. | ||
824 | */ | ||
533 | int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | 825 | int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, |
534 | struct btrfs_free_space_ctl *ctl, | 826 | struct btrfs_free_space_ctl *ctl, |
535 | struct btrfs_block_group_cache *block_group, | 827 | struct btrfs_block_group_cache *block_group, |
@@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
540 | struct extent_buffer *leaf; | 832 | struct extent_buffer *leaf; |
541 | struct rb_node *node; | 833 | struct rb_node *node; |
542 | struct list_head *pos, *n; | 834 | struct list_head *pos, *n; |
543 | struct page **pages; | ||
544 | struct page *page; | ||
545 | struct extent_state *cached_state = NULL; | 835 | struct extent_state *cached_state = NULL; |
546 | struct btrfs_free_cluster *cluster = NULL; | 836 | struct btrfs_free_cluster *cluster = NULL; |
547 | struct extent_io_tree *unpin = NULL; | 837 | struct extent_io_tree *unpin = NULL; |
838 | struct io_ctl io_ctl; | ||
548 | struct list_head bitmap_list; | 839 | struct list_head bitmap_list; |
549 | struct btrfs_key key; | 840 | struct btrfs_key key; |
550 | u64 start, end, len; | 841 | u64 start, end, len; |
551 | u64 bytes = 0; | ||
552 | u32 crc = ~(u32)0; | ||
553 | int index = 0, num_pages = 0; | ||
554 | int entries = 0; | 842 | int entries = 0; |
555 | int bitmaps = 0; | 843 | int bitmaps = 0; |
556 | int ret = -1; | 844 | int ret; |
557 | bool next_page = false; | 845 | int err = -1; |
558 | bool out_of_space = false; | ||
559 | 846 | ||
560 | INIT_LIST_HEAD(&bitmap_list); | 847 | INIT_LIST_HEAD(&bitmap_list); |
561 | 848 | ||
562 | node = rb_first(&ctl->free_space_offset); | ||
563 | if (!node) | ||
564 | return 0; | ||
565 | |||
566 | if (!i_size_read(inode)) | 849 | if (!i_size_read(inode)) |
567 | return -1; | 850 | return -1; |
568 | 851 | ||
569 | num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> | 852 | io_ctl_init(&io_ctl, inode, root); |
570 | PAGE_CACHE_SHIFT; | ||
571 | |||
572 | filemap_write_and_wait(inode->i_mapping); | ||
573 | btrfs_wait_ordered_range(inode, inode->i_size & | ||
574 | ~(root->sectorsize - 1), (u64)-1); | ||
575 | |||
576 | pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); | ||
577 | if (!pages) | ||
578 | return -1; | ||
579 | 853 | ||
580 | /* Get the cluster for this block_group if it exists */ | 854 | /* Get the cluster for this block_group if it exists */ |
581 | if (block_group && !list_empty(&block_group->cluster_list)) | 855 | if (block_group && !list_empty(&block_group->cluster_list)) |
@@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
589 | */ | 863 | */ |
590 | unpin = root->fs_info->pinned_extents; | 864 | unpin = root->fs_info->pinned_extents; |
591 | 865 | ||
592 | /* | 866 | /* Lock all pages first so we can lock the extent safely. */ |
593 | * Lock all pages first so we can lock the extent safely. | 867 | io_ctl_prepare_pages(&io_ctl, inode, 0); |
594 | * | ||
595 | * NOTE: Because we hold the ref the entire time we're going to write to | ||
596 | * the page find_get_page should never fail, so we don't do a check | ||
597 | * after find_get_page at this point. Just putting this here so people | ||
598 | * know and don't freak out. | ||
599 | */ | ||
600 | while (index < num_pages) { | ||
601 | page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); | ||
602 | if (!page) { | ||
603 | int i; | ||
604 | 868 | ||
605 | for (i = 0; i < num_pages; i++) { | ||
606 | unlock_page(pages[i]); | ||
607 | page_cache_release(pages[i]); | ||
608 | } | ||
609 | goto out; | ||
610 | } | ||
611 | pages[index] = page; | ||
612 | index++; | ||
613 | } | ||
614 | |||
615 | index = 0; | ||
616 | lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, | 869 | lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, |
617 | 0, &cached_state, GFP_NOFS); | 870 | 0, &cached_state, GFP_NOFS); |
618 | 871 | ||
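The one-line comment keeps the old ordering rule: every cache page is locked before the extent range. As a sketch (the deadlock rationale is an inference from the comment, not spelled out by the patch):

	/*
	 * 1. io_ctl_prepare_pages(): find_or_create_page() locks each page
	 * 2. lock_extent_bits():     then the extent range [0, i_size)
	 * writers that already hold page locks would otherwise race or
	 * deadlock against an extent lock taken first
	 */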
@@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
623 | if (block_group) | 876 | if (block_group) |
624 | start = block_group->key.objectid; | 877 | start = block_group->key.objectid; |
625 | 878 | ||
626 | /* Write out the extent entries */ | 879 | node = rb_first(&ctl->free_space_offset); |
627 | do { | 880 | if (!node && cluster) { |
628 | struct btrfs_free_space_entry *entry; | 881 | node = rb_first(&cluster->root); |
629 | void *addr, *orig; | 882 | cluster = NULL; |
630 | unsigned long offset = 0; | 883 | } |
631 | 884 | ||
632 | next_page = false; | 885 | /* Make sure we can fit our crcs into the first page */ |
886 | if (io_ctl.check_crcs && | ||
887 | (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) { | ||
888 | WARN_ON(1); | ||
889 | goto out_nospc; | ||
890 | } | ||
633 | 891 | ||
634 | if (index >= num_pages) { | 892 | io_ctl_set_generation(&io_ctl, trans->transid); |
635 | out_of_space = true; | ||
636 | break; | ||
637 | } | ||
638 | 893 | ||
639 | page = pages[index]; | 894 | /* Write out the extent entries */ |
895 | while (node) { | ||
896 | struct btrfs_free_space *e; | ||
640 | 897 | ||
641 | orig = addr = kmap(page); | 898 | e = rb_entry(node, struct btrfs_free_space, offset_index); |
642 | if (index == 0) { | 899 | entries++; |
643 | u64 *gen; | ||
644 | 900 | ||
645 | /* | 901 | ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes, |
646 | * We're going to put in a bogus crc for this page to | 902 | e->bitmap); |
647 | * make sure that old kernels who aren't aware of this | 903 | if (ret) |
648 | * format will be sure to discard the cache. | 904 | goto out_nospc; |
649 | */ | ||
650 | addr += sizeof(u64); | ||
651 | offset += sizeof(u64); | ||
652 | 905 | ||
653 | gen = addr; | 906 | if (e->bitmap) { |
654 | *gen = trans->transid; | 907 | list_add_tail(&e->list, &bitmap_list); |
655 | addr += sizeof(u64); | 908 | bitmaps++; |
656 | offset += sizeof(u64); | ||
657 | } | 909 | } |
658 | entry = addr; | 910 | node = rb_next(node); |
659 | 911 | if (!node && cluster) { | |
660 | memset(addr, 0, PAGE_CACHE_SIZE - offset); | 912 | node = rb_first(&cluster->root); |
661 | while (node && !next_page) { | 913 | cluster = NULL; |
662 | struct btrfs_free_space *e; | ||
663 | |||
664 | e = rb_entry(node, struct btrfs_free_space, offset_index); | ||
665 | entries++; | ||
666 | |||
667 | entry->offset = cpu_to_le64(e->offset); | ||
668 | entry->bytes = cpu_to_le64(e->bytes); | ||
669 | if (e->bitmap) { | ||
670 | entry->type = BTRFS_FREE_SPACE_BITMAP; | ||
671 | list_add_tail(&e->list, &bitmap_list); | ||
672 | bitmaps++; | ||
673 | } else { | ||
674 | entry->type = BTRFS_FREE_SPACE_EXTENT; | ||
675 | } | ||
676 | node = rb_next(node); | ||
677 | if (!node && cluster) { | ||
678 | node = rb_first(&cluster->root); | ||
679 | cluster = NULL; | ||
680 | } | ||
681 | offset += sizeof(struct btrfs_free_space_entry); | ||
682 | if (offset + sizeof(struct btrfs_free_space_entry) >= | ||
683 | PAGE_CACHE_SIZE) | ||
684 | next_page = true; | ||
685 | entry++; | ||
686 | } | 914 | } |
915 | } | ||
687 | 916 | ||
688 | /* | 917 | /* |
689 | * We want to add any pinned extents to our free space cache | 918 | * We want to add any pinned extents to our free space cache |
690 | * so we don't leak the space | 919 | * so we don't leak the space |
691 | */ | 920 | */ |
692 | while (block_group && !next_page && | 921 | while (block_group && (start < block_group->key.objectid + |
693 | (start < block_group->key.objectid + | 922 | block_group->key.offset)) { |
694 | block_group->key.offset)) { | 923 | ret = find_first_extent_bit(unpin, start, &start, &end, |
695 | ret = find_first_extent_bit(unpin, start, &start, &end, | 924 | EXTENT_DIRTY); |
696 | EXTENT_DIRTY); | 925 | if (ret) { |
697 | if (ret) { | 926 | ret = 0; |
698 | ret = 0; | 927 | break; |
699 | break; | ||
700 | } | ||
701 | |||
702 | /* This pinned extent is out of our range */ | ||
703 | if (start >= block_group->key.objectid + | ||
704 | block_group->key.offset) | ||
705 | break; | ||
706 | |||
707 | len = block_group->key.objectid + | ||
708 | block_group->key.offset - start; | ||
709 | len = min(len, end + 1 - start); | ||
710 | |||
711 | entries++; | ||
712 | entry->offset = cpu_to_le64(start); | ||
713 | entry->bytes = cpu_to_le64(len); | ||
714 | entry->type = BTRFS_FREE_SPACE_EXTENT; | ||
715 | |||
716 | start = end + 1; | ||
717 | offset += sizeof(struct btrfs_free_space_entry); | ||
718 | if (offset + sizeof(struct btrfs_free_space_entry) >= | ||
719 | PAGE_CACHE_SIZE) | ||
720 | next_page = true; | ||
721 | entry++; | ||
722 | } | 928 | } |
723 | 929 | ||
724 | /* Generate bogus crc value */ | 930 | /* This pinned extent is out of our range */ |
725 | if (index == 0) { | 931 | if (start >= block_group->key.objectid + |
726 | u32 *tmp; | 932 | block_group->key.offset) |
727 | crc = btrfs_csum_data(root, orig + sizeof(u64), crc, | 933 | break; |
728 | PAGE_CACHE_SIZE - sizeof(u64)); | ||
729 | btrfs_csum_final(crc, (char *)&crc); | ||
730 | crc++; | ||
731 | tmp = orig; | ||
732 | *tmp = crc; | ||
733 | } | ||
734 | 934 | ||
735 | kunmap(page); | 935 | len = block_group->key.objectid + |
936 | block_group->key.offset - start; | ||
937 | len = min(len, end + 1 - start); | ||
736 | 938 | ||
737 | bytes += PAGE_CACHE_SIZE; | 939 | entries++; |
940 | ret = io_ctl_add_entry(&io_ctl, start, len, NULL); | ||
941 | if (ret) | ||
942 | goto out_nospc; | ||
738 | 943 | ||
739 | index++; | 944 | start = end + 1; |
740 | } while (node || next_page); | 945 | } |
741 | 946 | ||
742 | /* Write out the bitmaps */ | 947 | /* Write out the bitmaps */ |
743 | list_for_each_safe(pos, n, &bitmap_list) { | 948 | list_for_each_safe(pos, n, &bitmap_list) { |
744 | void *addr; | ||
745 | struct btrfs_free_space *entry = | 949 | struct btrfs_free_space *entry = |
746 | list_entry(pos, struct btrfs_free_space, list); | 950 | list_entry(pos, struct btrfs_free_space, list); |
747 | 951 | ||
748 | if (index >= num_pages) { | 952 | ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap); |
749 | out_of_space = true; | 953 | if (ret) |
750 | break; | 954 | goto out_nospc; |
751 | } | ||
752 | page = pages[index]; | ||
753 | |||
754 | addr = kmap(page); | ||
755 | memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); | ||
756 | kunmap(page); | ||
757 | bytes += PAGE_CACHE_SIZE; | ||
758 | |||
759 | list_del_init(&entry->list); | 955 | list_del_init(&entry->list); |
760 | index++; | ||
761 | } | ||
762 | |||
763 | if (out_of_space) { | ||
764 | btrfs_drop_pages(pages, num_pages); | ||
765 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, | ||
766 | i_size_read(inode) - 1, &cached_state, | ||
767 | GFP_NOFS); | ||
768 | ret = 0; | ||
769 | goto out; | ||
770 | } | 956 | } |
771 | 957 | ||
772 | /* Zero out the rest of the pages just to make sure */ | 958 | /* Zero out the rest of the pages just to make sure */ |
773 | while (index < num_pages) { | 959 | io_ctl_zero_remaining_pages(&io_ctl); |
774 | void *addr; | ||
775 | 960 | ||
776 | page = pages[index]; | 961 | ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, |
777 | addr = kmap(page); | 962 | 0, i_size_read(inode), &cached_state); |
778 | memset(addr, 0, PAGE_CACHE_SIZE); | 963 | io_ctl_drop_pages(&io_ctl); |
779 | kunmap(page); | ||
780 | bytes += PAGE_CACHE_SIZE; | ||
781 | index++; | ||
782 | } | ||
783 | |||
784 | ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, | ||
785 | bytes, &cached_state); | ||
786 | btrfs_drop_pages(pages, num_pages); | ||
787 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, | 964 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, |
788 | i_size_read(inode) - 1, &cached_state, GFP_NOFS); | 965 | i_size_read(inode) - 1, &cached_state, GFP_NOFS); |
789 | 966 | ||
790 | if (ret) { | 967 | if (ret) |
791 | ret = 0; | ||
792 | goto out; | 968 | goto out; |
793 | } | ||
794 | 969 | ||
795 | BTRFS_I(inode)->generation = trans->transid; | ||
796 | 970 | ||
797 | filemap_write_and_wait(inode->i_mapping); | 971 | ret = filemap_write_and_wait(inode->i_mapping); |
972 | if (ret) | ||
973 | goto out; | ||
798 | 974 | ||
799 | key.objectid = BTRFS_FREE_SPACE_OBJECTID; | 975 | key.objectid = BTRFS_FREE_SPACE_OBJECTID; |
800 | key.offset = offset; | 976 | key.offset = offset; |
801 | key.type = 0; | 977 | key.type = 0; |
802 | 978 | ||
803 | ret = btrfs_search_slot(trans, root, &key, path, 1, 1); | 979 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
804 | if (ret < 0) { | 980 | if (ret < 0) { |
805 | ret = -1; | 981 | clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, |
806 | clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, | 982 | EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, |
807 | EXTENT_DIRTY | EXTENT_DELALLOC | | 983 | GFP_NOFS); |
808 | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); | ||
809 | goto out; | 984 | goto out; |
810 | } | 985 | } |
811 | leaf = path->nodes[0]; | 986 | leaf = path->nodes[0]; |
@@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
816 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | 991 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); |
817 | if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || | 992 | if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || |
818 | found_key.offset != offset) { | 993 | found_key.offset != offset) { |
819 | ret = -1; | 994 | clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, |
820 | clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, | 995 | inode->i_size - 1, |
821 | EXTENT_DIRTY | EXTENT_DELALLOC | | 996 | EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, |
822 | EXTENT_DO_ACCOUNTING, 0, 0, NULL, | 997 | NULL, GFP_NOFS); |
823 | GFP_NOFS); | ||
824 | btrfs_release_path(path); | 998 | btrfs_release_path(path); |
825 | goto out; | 999 | goto out; |
826 | } | 1000 | } |
827 | } | 1001 | } |
1002 | |||
1003 | BTRFS_I(inode)->generation = trans->transid; | ||
828 | header = btrfs_item_ptr(leaf, path->slots[0], | 1004 | header = btrfs_item_ptr(leaf, path->slots[0], |
829 | struct btrfs_free_space_header); | 1005 | struct btrfs_free_space_header); |
830 | btrfs_set_free_space_entries(leaf, header, entries); | 1006 | btrfs_set_free_space_entries(leaf, header, entries); |
@@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
833 | btrfs_mark_buffer_dirty(leaf); | 1009 | btrfs_mark_buffer_dirty(leaf); |
834 | btrfs_release_path(path); | 1010 | btrfs_release_path(path); |
835 | 1011 | ||
836 | ret = 1; | 1012 | err = 0; |
837 | |||
838 | out: | 1013 | out: |
839 | kfree(pages); | 1014 | io_ctl_free(&io_ctl); |
840 | if (ret != 1) { | 1015 | if (err) { |
841 | invalidate_inode_pages2_range(inode->i_mapping, 0, index); | 1016 | invalidate_inode_pages2(inode->i_mapping); |
842 | BTRFS_I(inode)->generation = 0; | 1017 | BTRFS_I(inode)->generation = 0; |
843 | } | 1018 | } |
844 | btrfs_update_inode(trans, root, inode); | 1019 | btrfs_update_inode(trans, root, inode); |
845 | return ret; | 1020 | return err; |
1021 | |||
1022 | out_nospc: | ||
1023 | list_for_each_safe(pos, n, &bitmap_list) { | ||
1024 | struct btrfs_free_space *entry = | ||
1025 | list_entry(pos, struct btrfs_free_space, list); | ||
1026 | list_del_init(&entry->list); | ||
1027 | } | ||
1028 | io_ctl_drop_pages(&io_ctl); | ||
1029 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, | ||
1030 | i_size_read(inode) - 1, &cached_state, GFP_NOFS); | ||
1031 | goto out; | ||
846 | } | 1032 | } |
847 | 1033 | ||
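The crc-fit guard near the top of __btrfs_write_out_cache() also caps the cache size; a worked check assuming 4 KiB pages:

	/* (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE trips the WARN_ON,
	 * so num_pages <= 4096/4 - 1 = 1023 pages: just under 4 MiB of
	 * free space cache per block group (illustrative numbers) */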
848 | int btrfs_write_out_cache(struct btrfs_root *root, | 1034 | int btrfs_write_out_cache(struct btrfs_root *root, |
@@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root, | |||
869 | 1055 | ||
870 | ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, | 1056 | ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, |
871 | path, block_group->key.objectid); | 1057 | path, block_group->key.objectid); |
872 | if (ret < 0) { | 1058 | if (ret) { |
873 | spin_lock(&block_group->lock); | 1059 | spin_lock(&block_group->lock); |
874 | block_group->disk_cache_state = BTRFS_DC_ERROR; | 1060 | block_group->disk_cache_state = BTRFS_DC_ERROR; |
875 | spin_unlock(&block_group->lock); | 1061 | spin_unlock(&block_group->lock); |
876 | ret = 0; | 1062 | ret = 0; |
877 | 1063 | #ifdef DEBUG | |
878 | printk(KERN_ERR "btrfs: failed to write free space cache " | 1064 | printk(KERN_ERR "btrfs: failed to write free space cache "
879 | "for block group %llu\n", block_group->key.objectid); | 1065 | "for block group %llu\n", block_group->key.objectid); |
1066 | #endif | ||
880 | } | 1067 | } |
881 | 1068 | ||
882 | iput(inode); | 1069 | iput(inode); |
@@ -1283,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, | |||
1283 | { | 1470 | { |
1284 | info->offset = offset_to_bitmap(ctl, offset); | 1471 | info->offset = offset_to_bitmap(ctl, offset); |
1285 | info->bytes = 0; | 1472 | info->bytes = 0; |
1473 | INIT_LIST_HEAD(&info->list); | ||
1286 | link_free_space(ctl, info); | 1474 | link_free_space(ctl, info); |
1287 | ctl->total_bitmaps++; | 1475 | ctl->total_bitmaps++; |
1288 | 1476 | ||
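The new INIT_LIST_HEAD() is what lets the cluster code probe the same field safely; without it, the list_empty() test in setup_cluster_bitmap() would read an uninitialized list_head. The consumer, quoted from the hunk further down:

	if (entry && list_empty(&entry->list))	/* safe only once initialized */
		list_add(&entry->list, bitmaps);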
@@ -1662,7 +1850,13 @@ again: | |||
1662 | info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), | 1850 | info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), |
1663 | 1, 0); | 1851 | 1, 0); |
1664 | if (!info) { | 1852 | if (!info) { |
1665 | WARN_ON(1); | 1853 | /* the tree logging code might be calling us before we |
1854 | * have fully loaded the free space rbtree for this | ||
1855 | * block group. So it is possible the entry won't | ||
1856 | * be in the rbtree yet at all. The caching code | ||
1857 | * will make sure not to put it in the rbtree if | ||
1858 | * the logging code has pinned it. | ||
1859 | */ | ||
1666 | goto out_lock; | 1860 | goto out_lock; |
1667 | } | 1861 | } |
1668 | } | 1862 | } |
@@ -1701,6 +1895,7 @@ again: | |||
1701 | ctl->total_bitmaps--; | 1895 | ctl->total_bitmaps--; |
1702 | } | 1896 | } |
1703 | kmem_cache_free(btrfs_free_space_cachep, info); | 1897 | kmem_cache_free(btrfs_free_space_cachep, info); |
1898 | ret = 0; | ||
1704 | goto out_lock; | 1899 | goto out_lock; |
1705 | } | 1900 | } |
1706 | 1901 | ||
@@ -1708,7 +1903,8 @@ again: | |||
1708 | unlink_free_space(ctl, info); | 1903 | unlink_free_space(ctl, info); |
1709 | info->offset += bytes; | 1904 | info->offset += bytes; |
1710 | info->bytes -= bytes; | 1905 | info->bytes -= bytes; |
1711 | link_free_space(ctl, info); | 1906 | ret = link_free_space(ctl, info); |
1907 | WARN_ON(ret); | ||
1712 | goto out_lock; | 1908 | goto out_lock; |
1713 | } | 1909 | } |
1714 | 1910 | ||
@@ -2124,6 +2320,7 @@ again: | |||
2124 | 2320 | ||
2125 | if (!found) { | 2321 | if (!found) { |
2126 | start = i; | 2322 | start = i; |
2323 | cluster->max_size = 0; | ||
2127 | found = true; | 2324 | found = true; |
2128 | } | 2325 | } |
2129 | 2326 | ||
@@ -2267,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, | |||
2267 | { | 2464 | { |
2268 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | 2465 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; |
2269 | struct btrfs_free_space *entry; | 2466 | struct btrfs_free_space *entry; |
2270 | struct rb_node *node; | ||
2271 | int ret = -ENOSPC; | 2467 | int ret = -ENOSPC; |
2468 | u64 bitmap_offset = offset_to_bitmap(ctl, offset); | ||
2272 | 2469 | ||
2273 | if (ctl->total_bitmaps == 0) | 2470 | if (ctl->total_bitmaps == 0) |
2274 | return -ENOSPC; | 2471 | return -ENOSPC; |
2275 | 2472 | ||
2276 | /* | 2473 | /* |
2277 | * First check our cached list of bitmaps and see if there is an entry | 2474 | * The bitmap that covers offset won't be in the list unless offset |
2278 | * here that will work. | 2475 | * is just its start offset. |
2279 | */ | 2476 | */ |
2477 | entry = list_first_entry(bitmaps, struct btrfs_free_space, list); | ||
2478 | if (entry->offset != bitmap_offset) { | ||
2479 | entry = tree_search_offset(ctl, bitmap_offset, 1, 0); | ||
2480 | if (entry && list_empty(&entry->list)) | ||
2481 | list_add(&entry->list, bitmaps); | ||
2482 | } | ||
2483 | |||
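The start-offset caveat is easiest to see with numbers. Assuming a 4 KiB unit, one bitmap covers 32768 bits * 4 KiB = 128 MiB (illustrative; the real span is ctl->unit * BITS_PER_BITMAP), so:

	/* a bitmap spanning [X, X + 128M) is queued under offset X; a
	 * search for X + 4K can only find it via the
	 * tree_search_offset(ctl, bitmap_offset, 1, 0) fallback above */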
2280 | list_for_each_entry(entry, bitmaps, list) { | 2484 | list_for_each_entry(entry, bitmaps, list) { |
2281 | if (entry->bytes < min_bytes) | 2485 | if (entry->bytes < min_bytes) |
2282 | continue; | 2486 | continue; |
@@ -2287,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, | |||
2287 | } | 2491 | } |
2288 | 2492 | ||
2289 | /* | 2493 | /* |
2290 | * If we do have entries on our list and we are here then we didn't find | 2494 | * The bitmaps list has all the bitmaps that record free space |
2291 | * anything, so go ahead and get the next entry after the last entry in | 2495 | * starting after offset, so no more search is required. |
2292 | * this list and start the search from there. | ||
2293 | */ | 2496 | */ |
2294 | if (!list_empty(bitmaps)) { | 2497 | return -ENOSPC; |
2295 | entry = list_entry(bitmaps->prev, struct btrfs_free_space, | ||
2296 | list); | ||
2297 | node = rb_next(&entry->offset_index); | ||
2298 | if (!node) | ||
2299 | return -ENOSPC; | ||
2300 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | ||
2301 | goto search; | ||
2302 | } | ||
2303 | |||
2304 | entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); | ||
2305 | if (!entry) | ||
2306 | return -ENOSPC; | ||
2307 | |||
2308 | search: | ||
2309 | node = &entry->offset_index; | ||
2310 | do { | ||
2311 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | ||
2312 | node = rb_next(&entry->offset_index); | ||
2313 | if (!entry->bitmap) | ||
2314 | continue; | ||
2315 | if (entry->bytes < min_bytes) | ||
2316 | continue; | ||
2317 | ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, | ||
2318 | bytes, min_bytes); | ||
2319 | } while (ret && node); | ||
2320 | |||
2321 | return ret; | ||
2322 | } | 2498 | } |
2323 | 2499 | ||
2324 | /* | 2500 | /* |
@@ -2336,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
2336 | u64 offset, u64 bytes, u64 empty_size) | 2512 | u64 offset, u64 bytes, u64 empty_size) |
2337 | { | 2513 | { |
2338 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | 2514 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; |
2339 | struct list_head bitmaps; | ||
2340 | struct btrfs_free_space *entry, *tmp; | 2515 | struct btrfs_free_space *entry, *tmp; |
2516 | LIST_HEAD(bitmaps); | ||
2341 | u64 min_bytes; | 2517 | u64 min_bytes; |
2342 | int ret; | 2518 | int ret; |
2343 | 2519 | ||
@@ -2376,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
2376 | goto out; | 2552 | goto out; |
2377 | } | 2553 | } |
2378 | 2554 | ||
2379 | INIT_LIST_HEAD(&bitmaps); | ||
2380 | ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, | 2555 | ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, |
2381 | bytes, min_bytes); | 2556 | bytes, min_bytes); |
2382 | if (ret) | 2557 | if (ret) |
@@ -2472,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, | |||
2472 | spin_unlock(&ctl->tree_lock); | 2647 | spin_unlock(&ctl->tree_lock); |
2473 | 2648 | ||
2474 | if (bytes >= minlen) { | 2649 | if (bytes >= minlen) { |
2475 | int update_ret; | 2650 | struct btrfs_space_info *space_info; |
2476 | update_ret = btrfs_update_reserved_bytes(block_group, | 2651 | int update = 0; |
2477 | bytes, 1, 1); | 2652 | |
2653 | space_info = block_group->space_info; | ||
2654 | spin_lock(&space_info->lock); | ||
2655 | spin_lock(&block_group->lock); | ||
2656 | if (!block_group->ro) { | ||
2657 | block_group->reserved += bytes; | ||
2658 | space_info->bytes_reserved += bytes; | ||
2659 | update = 1; | ||
2660 | } | ||
2661 | spin_unlock(&block_group->lock); | ||
2662 | spin_unlock(&space_info->lock); | ||
2478 | 2663 | ||
2479 | ret = btrfs_error_discard_extent(fs_info->extent_root, | 2664 | ret = btrfs_error_discard_extent(fs_info->extent_root, |
2480 | start, | 2665 | start, |
@@ -2482,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, | |||
2482 | &actually_trimmed); | 2667 | &actually_trimmed); |
2483 | 2668 | ||
2484 | btrfs_add_free_space(block_group, start, bytes); | 2669 | btrfs_add_free_space(block_group, start, bytes); |
2485 | if (!update_ret) | 2670 | if (update) { |
2486 | btrfs_update_reserved_bytes(block_group, | 2671 | spin_lock(&space_info->lock); |
2487 | bytes, 0, 1); | 2672 | spin_lock(&block_group->lock); |
2673 | if (block_group->ro) | ||
2674 | space_info->bytes_readonly += bytes; | ||
2675 | block_group->reserved -= bytes; | ||
2676 | space_info->bytes_reserved -= bytes; | ||
2677 | spin_unlock(&space_info->lock); | ||
2678 | spin_unlock(&block_group->lock); | ||
2679 | } | ||
2488 | 2680 | ||
2489 | if (ret) | 2681 | if (ret) |
2490 | break; | 2682 | break; |
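The open-coded accounting replaces btrfs_update_reserved_bytes() with a matched pair; the invariant is that reserve and release touch the same counters under the same lock order:

	/* reserve (before discard):             release (after re-adding):
	 *   block_group->reserved += bytes;       block_group->reserved -= bytes;
	 *   space_info->bytes_reserved += bytes;  space_info->bytes_reserved -= bytes;
	 * both taken under space_info->lock, then block_group->lock */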
@@ -2643,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, | |||
2643 | return 0; | 2835 | return 0; |
2644 | 2836 | ||
2645 | ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); | 2837 | ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); |
2646 | if (ret < 0) | 2838 | if (ret) { |
2839 | btrfs_delalloc_release_metadata(inode, inode->i_size); | ||
2840 | #ifdef DEBUG | ||
2647 | printk(KERN_ERR "btrfs: failed to write free ino cache " | 2841 | printk(KERN_ERR "btrfs: failed to write free ino cache " |
2648 | "for root %llu\n", root->root_key.objectid); | 2842 | "for root %llu\n", root->root_key.objectid); |
2843 | #endif | ||
2844 | } | ||
2649 | 2845 | ||
2650 | iput(inode); | 2846 | iput(inode); |
2651 | return ret; | 2847 | return ret; |
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index b4087e0fa87..f8962a957d6 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c | |||
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, | |||
398 | struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; | 398 | struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; |
399 | struct btrfs_path *path; | 399 | struct btrfs_path *path; |
400 | struct inode *inode; | 400 | struct inode *inode; |
401 | struct btrfs_block_rsv *rsv; | ||
402 | u64 num_bytes; | ||
401 | u64 alloc_hint = 0; | 403 | u64 alloc_hint = 0; |
402 | int ret; | 404 | int ret; |
403 | int prealloc; | 405 | int prealloc; |
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root, | |||
421 | if (!path) | 423 | if (!path) |
422 | return -ENOMEM; | 424 | return -ENOMEM; |
423 | 425 | ||
426 | rsv = trans->block_rsv; | ||
427 | trans->block_rsv = &root->fs_info->trans_block_rsv; | ||
428 | |||
429 | num_bytes = trans->bytes_reserved; | ||
430 | /* | ||
431 | * 1 item for inode item insertion if needed | ||
432 | * 3 items for inode item update (in the worst case) | ||
433 | * 1 item for free space object | ||
434 | * 3 items for pre-allocation | ||
435 | */ | ||
436 | trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); | ||
437 | ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, | ||
438 | trans->bytes_reserved); | ||
439 | if (ret) | ||
440 | goto out; | ||
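The 8 handed to btrfs_calc_trans_metadata_size() is just the sum of the worst cases enumerated in the comment above; spelled out:

	/* 1 (inode item insert, if needed) + 3 (inode item update, worst
	 * case) + 1 (free space object) + 3 (preallocation) = 8 items;
	 * btrfs_calc_trans_metadata_size() then scales that count by a
	 * worst-case per-item footprint of several tree blocks */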
424 | again: | 441 | again: |
425 | inode = lookup_free_ino_inode(root, path); | 442 | inode = lookup_free_ino_inode(root, path); |
426 | if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { | 443 | if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { |
427 | ret = PTR_ERR(inode); | 444 | ret = PTR_ERR(inode); |
428 | goto out; | 445 | goto out_release; |
429 | } | 446 | } |
430 | 447 | ||
431 | if (IS_ERR(inode)) { | 448 | if (IS_ERR(inode)) { |
@@ -434,7 +451,7 @@ again: | |||
434 | 451 | ||
435 | ret = create_free_ino_inode(root, trans, path); | 452 | ret = create_free_ino_inode(root, trans, path); |
436 | if (ret) | 453 | if (ret) |
437 | goto out; | 454 | goto out_release; |
438 | goto again; | 455 | goto again; |
439 | } | 456 | } |
440 | 457 | ||
@@ -465,21 +482,26 @@ again: | |||
465 | /* Just to make sure we have enough space */ | 482 | /* Just to make sure we have enough space */ |
466 | prealloc += 8 * PAGE_CACHE_SIZE; | 483 | prealloc += 8 * PAGE_CACHE_SIZE; |
467 | 484 | ||
468 | ret = btrfs_check_data_free_space(inode, prealloc); | 485 | ret = btrfs_delalloc_reserve_space(inode, prealloc); |
469 | if (ret) | 486 | if (ret) |
470 | goto out_put; | 487 | goto out_put; |
471 | 488 | ||
472 | ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, | 489 | ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, |
473 | prealloc, prealloc, &alloc_hint); | 490 | prealloc, prealloc, &alloc_hint); |
474 | if (ret) | 491 | if (ret) { |
492 | btrfs_delalloc_release_space(inode, prealloc); | ||
475 | goto out_put; | 493 | goto out_put; |
494 | } | ||
476 | btrfs_free_reserved_data_space(inode, prealloc); | 495 | btrfs_free_reserved_data_space(inode, prealloc); |
477 | 496 | ||
497 | ret = btrfs_write_out_ino_cache(root, trans, path); | ||
478 | out_put: | 498 | out_put: |
479 | iput(inode); | 499 | iput(inode); |
500 | out_release: | ||
501 | btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); | ||
480 | out: | 502 | out: |
481 | if (ret == 0) | 503 | trans->block_rsv = rsv; |
482 | ret = btrfs_write_out_ino_cache(root, trans, path); | 504 | trans->bytes_reserved = num_bytes; |
483 | 505 | ||
484 | btrfs_free_path(path); | 506 | btrfs_free_path(path); |
485 | return ret; | 507 | return ret; |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 75686a61bd4..2c984f7d4c2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -45,10 +45,10 @@ | |||
45 | #include "btrfs_inode.h" | 45 | #include "btrfs_inode.h" |
46 | #include "ioctl.h" | 46 | #include "ioctl.h" |
47 | #include "print-tree.h" | 47 | #include "print-tree.h" |
48 | #include "volumes.h" | ||
49 | #include "ordered-data.h" | 48 | #include "ordered-data.h" |
50 | #include "xattr.h" | 49 | #include "xattr.h" |
51 | #include "tree-log.h" | 50 | #include "tree-log.h" |
51 | #include "volumes.h" | ||
52 | #include "compression.h" | 52 | #include "compression.h" |
53 | #include "locking.h" | 53 | #include "locking.h" |
54 | #include "free-space-cache.h" | 54 | #include "free-space-cache.h" |
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode, | |||
93 | struct page *locked_page, | 93 | struct page *locked_page, |
94 | u64 start, u64 end, int *page_started, | 94 | u64 start, u64 end, int *page_started, |
95 | unsigned long *nr_written, int unlock); | 95 | unsigned long *nr_written, int unlock); |
96 | static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, | ||
97 | struct btrfs_root *root, struct inode *inode); | ||
96 | 98 | ||
97 | static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, | 99 | static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, |
98 | struct inode *inode, struct inode *dir, | 100 | struct inode *inode, struct inode *dir, |
@@ -393,7 +395,10 @@ again: | |||
393 | (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { | 395 | (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { |
394 | WARN_ON(pages); | 396 | WARN_ON(pages); |
395 | pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); | 397 | pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); |
396 | BUG_ON(!pages); | 398 | if (!pages) { |
399 | /* just bail out to the uncompressed code */ | ||
400 | goto cont; | ||
401 | } | ||
397 | 402 | ||
398 | if (BTRFS_I(inode)->force_compress) | 403 | if (BTRFS_I(inode)->force_compress) |
399 | compress_type = BTRFS_I(inode)->force_compress; | 404 | compress_type = BTRFS_I(inode)->force_compress; |
@@ -424,6 +429,7 @@ again: | |||
424 | will_compress = 1; | 429 | will_compress = 1; |
425 | } | 430 | } |
426 | } | 431 | } |
432 | cont: | ||
427 | if (start == 0) { | 433 | if (start == 0) { |
428 | trans = btrfs_join_transaction(root); | 434 | trans = btrfs_join_transaction(root); |
429 | BUG_ON(IS_ERR(trans)); | 435 | BUG_ON(IS_ERR(trans)); |
@@ -820,7 +826,7 @@ static noinline int cow_file_range(struct inode *inode, | |||
820 | } | 826 | } |
821 | 827 | ||
822 | BUG_ON(disk_num_bytes > | 828 | BUG_ON(disk_num_bytes > |
823 | btrfs_super_total_bytes(&root->fs_info->super_copy)); | 829 | btrfs_super_total_bytes(root->fs_info->super_copy)); |
824 | 830 | ||
825 | alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); | 831 | alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); |
826 | btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); | 832 | btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); |
@@ -1737,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) | |||
1737 | trans = btrfs_join_transaction(root); | 1743 | trans = btrfs_join_transaction(root); |
1738 | BUG_ON(IS_ERR(trans)); | 1744 | BUG_ON(IS_ERR(trans)); |
1739 | trans->block_rsv = &root->fs_info->delalloc_block_rsv; | 1745 | trans->block_rsv = &root->fs_info->delalloc_block_rsv; |
1740 | ret = btrfs_update_inode(trans, root, inode); | 1746 | ret = btrfs_update_inode_fallback(trans, root, inode); |
1741 | BUG_ON(ret); | 1747 | BUG_ON(ret); |
1742 | } | 1748 | } |
1743 | goto out; | 1749 | goto out; |
@@ -1787,17 +1793,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) | |||
1787 | 1793 | ||
1788 | ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); | 1794 | ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); |
1789 | if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { | 1795 | if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { |
1790 | ret = btrfs_update_inode(trans, root, inode); | 1796 | ret = btrfs_update_inode_fallback(trans, root, inode); |
1791 | BUG_ON(ret); | 1797 | BUG_ON(ret); |
1792 | } | 1798 | } |
1793 | ret = 0; | 1799 | ret = 0; |
1794 | out: | 1800 | out: |
1795 | if (nolock) { | 1801 | if (root != root->fs_info->tree_root) |
1796 | if (trans) | ||
1797 | btrfs_end_transaction_nolock(trans, root); | ||
1798 | } else { | ||
1799 | btrfs_delalloc_release_metadata(inode, ordered_extent->len); | 1802 | btrfs_delalloc_release_metadata(inode, ordered_extent->len); |
1800 | if (trans) | 1803 | if (trans) { |
1804 | if (nolock) | ||
1805 | btrfs_end_transaction_nolock(trans, root); | ||
1806 | else | ||
1801 | btrfs_end_transaction(trans, root); | 1807 | btrfs_end_transaction(trans, root); |
1802 | } | 1808 | } |
1803 | 1809 | ||
@@ -1819,153 +1825,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | |||
1819 | } | 1825 | } |
1820 | 1826 | ||
1821 | /* | 1827 | /* |
1822 | * When IO fails, either with EIO or csum verification fails, we | ||
1823 | * try other mirrors that might have a good copy of the data. This | ||
1824 | * io_failure_record is used to record state as we go through all the | ||
1825 | * mirrors. If another mirror has good data, the page is set up to date | ||
1826 | * and things continue. If a good mirror can't be found, the original | ||
1827 | * bio end_io callback is called to indicate things have failed. | ||
1828 | */ | ||
1829 | struct io_failure_record { | ||
1830 | struct page *page; | ||
1831 | u64 start; | ||
1832 | u64 len; | ||
1833 | u64 logical; | ||
1834 | unsigned long bio_flags; | ||
1835 | int last_mirror; | ||
1836 | }; | ||
1837 | |||
1838 | static int btrfs_io_failed_hook(struct bio *failed_bio, | ||
1839 | struct page *page, u64 start, u64 end, | ||
1840 | struct extent_state *state) | ||
1841 | { | ||
1842 | struct io_failure_record *failrec = NULL; | ||
1843 | u64 private; | ||
1844 | struct extent_map *em; | ||
1845 | struct inode *inode = page->mapping->host; | ||
1846 | struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; | ||
1847 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
1848 | struct bio *bio; | ||
1849 | int num_copies; | ||
1850 | int ret; | ||
1851 | int rw; | ||
1852 | u64 logical; | ||
1853 | |||
1854 | ret = get_state_private(failure_tree, start, &private); | ||
1855 | if (ret) { | ||
1856 | failrec = kmalloc(sizeof(*failrec), GFP_NOFS); | ||
1857 | if (!failrec) | ||
1858 | return -ENOMEM; | ||
1859 | failrec->start = start; | ||
1860 | failrec->len = end - start + 1; | ||
1861 | failrec->last_mirror = 0; | ||
1862 | failrec->bio_flags = 0; | ||
1863 | |||
1864 | read_lock(&em_tree->lock); | ||
1865 | em = lookup_extent_mapping(em_tree, start, failrec->len); | ||
1866 | if (em->start > start || em->start + em->len < start) { | ||
1867 | free_extent_map(em); | ||
1868 | em = NULL; | ||
1869 | } | ||
1870 | read_unlock(&em_tree->lock); | ||
1871 | |||
1872 | if (IS_ERR_OR_NULL(em)) { | ||
1873 | kfree(failrec); | ||
1874 | return -EIO; | ||
1875 | } | ||
1876 | logical = start - em->start; | ||
1877 | logical = em->block_start + logical; | ||
1878 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { | ||
1879 | logical = em->block_start; | ||
1880 | failrec->bio_flags = EXTENT_BIO_COMPRESSED; | ||
1881 | extent_set_compress_type(&failrec->bio_flags, | ||
1882 | em->compress_type); | ||
1883 | } | ||
1884 | failrec->logical = logical; | ||
1885 | free_extent_map(em); | ||
1886 | set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | | ||
1887 | EXTENT_DIRTY, GFP_NOFS); | ||
1888 | set_state_private(failure_tree, start, | ||
1889 | (u64)(unsigned long)failrec); | ||
1890 | } else { | ||
1891 | failrec = (struct io_failure_record *)(unsigned long)private; | ||
1892 | } | ||
1893 | num_copies = btrfs_num_copies( | ||
1894 | &BTRFS_I(inode)->root->fs_info->mapping_tree, | ||
1895 | failrec->logical, failrec->len); | ||
1896 | failrec->last_mirror++; | ||
1897 | if (!state) { | ||
1898 | spin_lock(&BTRFS_I(inode)->io_tree.lock); | ||
1899 | state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, | ||
1900 | failrec->start, | ||
1901 | EXTENT_LOCKED); | ||
1902 | if (state && state->start != failrec->start) | ||
1903 | state = NULL; | ||
1904 | spin_unlock(&BTRFS_I(inode)->io_tree.lock); | ||
1905 | } | ||
1906 | if (!state || failrec->last_mirror > num_copies) { | ||
1907 | set_state_private(failure_tree, failrec->start, 0); | ||
1908 | clear_extent_bits(failure_tree, failrec->start, | ||
1909 | failrec->start + failrec->len - 1, | ||
1910 | EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); | ||
1911 | kfree(failrec); | ||
1912 | return -EIO; | ||
1913 | } | ||
1914 | bio = bio_alloc(GFP_NOFS, 1); | ||
1915 | bio->bi_private = state; | ||
1916 | bio->bi_end_io = failed_bio->bi_end_io; | ||
1917 | bio->bi_sector = failrec->logical >> 9; | ||
1918 | bio->bi_bdev = failed_bio->bi_bdev; | ||
1919 | bio->bi_size = 0; | ||
1920 | |||
1921 | bio_add_page(bio, page, failrec->len, start - page_offset(page)); | ||
1922 | if (failed_bio->bi_rw & REQ_WRITE) | ||
1923 | rw = WRITE; | ||
1924 | else | ||
1925 | rw = READ; | ||
1926 | |||
1927 | ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, | ||
1928 | failrec->last_mirror, | ||
1929 | failrec->bio_flags, 0); | ||
1930 | return ret; | ||
1931 | } | ||
1932 | |||
1933 | /* | ||
1934 | * each time an IO finishes, we do a fast check in the IO failure tree | ||
1935 | * to see if we need to process or clean up an io_failure_record | ||
1936 | */ | ||
1937 | static int btrfs_clean_io_failures(struct inode *inode, u64 start) | ||
1938 | { | ||
1939 | u64 private; | ||
1940 | u64 private_failure; | ||
1941 | struct io_failure_record *failure; | ||
1942 | int ret; | ||
1943 | |||
1944 | private = 0; | ||
1945 | if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, | ||
1946 | (u64)-1, 1, EXTENT_DIRTY, 0)) { | ||
1947 | ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, | ||
1948 | start, &private_failure); | ||
1949 | if (ret == 0) { | ||
1950 | failure = (struct io_failure_record *)(unsigned long) | ||
1951 | private_failure; | ||
1952 | set_state_private(&BTRFS_I(inode)->io_failure_tree, | ||
1953 | failure->start, 0); | ||
1954 | clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, | ||
1955 | failure->start, | ||
1956 | failure->start + failure->len - 1, | ||
1957 | EXTENT_DIRTY | EXTENT_LOCKED, | ||
1958 | GFP_NOFS); | ||
1959 | kfree(failure); | ||
1960 | } | ||
1961 | } | ||
1962 | return 0; | ||
1963 | } | ||
1964 | |||
1965 | /* | ||
1966 | * when reads are done, we need to check csums to verify the data is correct | 1828 | * when reads are done, we need to check csums to verify the data is correct |
1967 | * if there's a match, we allow the bio to finish. If not, we go through | 1829 | * if there's a match, we allow the bio to finish. If not, the code in |
1968 | * the io_failure_record routines to find good copies | 1830 | * extent_io.c will try to find good copies for us. |
1969 | */ | 1831 | */ |
1970 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, | 1832 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, |
1971 | struct extent_state *state) | 1833 | struct extent_state *state) |
@@ -2011,10 +1873,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, | |||
2011 | 1873 | ||
2012 | kunmap_atomic(kaddr, KM_USER0); | 1874 | kunmap_atomic(kaddr, KM_USER0); |
2013 | good: | 1875 | good: |
2014 | /* if the io failure tree for this inode is non-empty, | ||
2015 | * check to see if we've recovered from a failed IO | ||
2016 | */ | ||
2017 | btrfs_clean_io_failures(inode, start); | ||
2018 | return 0; | 1876 | return 0; |
2019 | 1877 | ||
2020 | zeroit: | 1878 | zeroit: |
@@ -2079,89 +1937,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) | |||
2079 | up_read(&root->fs_info->cleanup_work_sem); | 1937 | up_read(&root->fs_info->cleanup_work_sem); |
2080 | } | 1938 | } |
2081 | 1939 | ||
2082 | /* | ||
2083 | * calculate extra metadata reservation when snapshotting a subvolume | ||
2084 | * contains orphan files. | ||
2085 | */ | ||
2086 | void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, | ||
2087 | struct btrfs_pending_snapshot *pending, | ||
2088 | u64 *bytes_to_reserve) | ||
2089 | { | ||
2090 | struct btrfs_root *root; | ||
2091 | struct btrfs_block_rsv *block_rsv; | ||
2092 | u64 num_bytes; | ||
2093 | int index; | ||
2094 | |||
2095 | root = pending->root; | ||
2096 | if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) | ||
2097 | return; | ||
2098 | |||
2099 | block_rsv = root->orphan_block_rsv; | ||
2100 | |||
2101 | /* orphan block reservation for the snapshot */ | ||
2102 | num_bytes = block_rsv->size; | ||
2103 | |||
2104 | /* | ||
2105 | * after the snapshot is created, COWing tree blocks may use more | ||
2106 | * space than it frees. So we should make sure there is enough | ||
2107 | * reserved space. | ||
2108 | */ | ||
2109 | index = trans->transid & 0x1; | ||
2110 | if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { | ||
2111 | num_bytes += block_rsv->size - | ||
2112 | (block_rsv->reserved + block_rsv->freed[index]); | ||
2113 | } | ||
2114 | |||
2115 | *bytes_to_reserve += num_bytes; | ||
2116 | } | ||
2117 | |||
2118 | void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, | ||
2119 | struct btrfs_pending_snapshot *pending) | ||
2120 | { | ||
2121 | struct btrfs_root *root = pending->root; | ||
2122 | struct btrfs_root *snap = pending->snap; | ||
2123 | struct btrfs_block_rsv *block_rsv; | ||
2124 | u64 num_bytes; | ||
2125 | int index; | ||
2126 | int ret; | ||
2127 | |||
2128 | if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) | ||
2129 | return; | ||
2130 | |||
2131 | /* refill source subvolume's orphan block reservation */ | ||
2132 | block_rsv = root->orphan_block_rsv; | ||
2133 | index = trans->transid & 0x1; | ||
2134 | if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { | ||
2135 | num_bytes = block_rsv->size - | ||
2136 | (block_rsv->reserved + block_rsv->freed[index]); | ||
2137 | ret = btrfs_block_rsv_migrate(&pending->block_rsv, | ||
2138 | root->orphan_block_rsv, | ||
2139 | num_bytes); | ||
2140 | BUG_ON(ret); | ||
2141 | } | ||
2142 | |||
2143 | /* setup orphan block reservation for the snapshot */ | ||
2144 | block_rsv = btrfs_alloc_block_rsv(snap); | ||
2145 | BUG_ON(!block_rsv); | ||
2146 | |||
2147 | btrfs_add_durable_block_rsv(root->fs_info, block_rsv); | ||
2148 | snap->orphan_block_rsv = block_rsv; | ||
2149 | |||
2150 | num_bytes = root->orphan_block_rsv->size; | ||
2151 | ret = btrfs_block_rsv_migrate(&pending->block_rsv, | ||
2152 | block_rsv, num_bytes); | ||
2153 | BUG_ON(ret); | ||
2154 | |||
2155 | #if 0 | ||
2156 | /* insert orphan item for the snapshot */ | ||
2157 | WARN_ON(!root->orphan_item_inserted); | ||
2158 | ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, | ||
2159 | snap->root_key.objectid); | ||
2160 | BUG_ON(ret); | ||
2161 | snap->orphan_item_inserted = 1; | ||
2162 | #endif | ||
2163 | } | ||
2164 | |||
2165 | enum btrfs_orphan_cleanup_state { | 1940 | enum btrfs_orphan_cleanup_state { |
2166 | ORPHAN_CLEANUP_STARTED = 1, | 1941 | ORPHAN_CLEANUP_STARTED = 1, |
2167 | ORPHAN_CLEANUP_DONE = 2, | 1942 | ORPHAN_CLEANUP_DONE = 2, |
@@ -2247,9 +2022,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) | |||
2247 | } | 2022 | } |
2248 | spin_unlock(&root->orphan_lock); | 2023 | spin_unlock(&root->orphan_lock); |
2249 | 2024 | ||
2250 | if (block_rsv) | ||
2251 | btrfs_add_durable_block_rsv(root->fs_info, block_rsv); | ||
2252 | |||
2253 | /* grab metadata reservation from transaction handle */ | 2025 | /* grab metadata reservation from transaction handle */ |
2254 | if (reserve) { | 2026 | if (reserve) { |
2255 | ret = btrfs_orphan_reserve_metadata(trans, inode); | 2027 | ret = btrfs_orphan_reserve_metadata(trans, inode); |
@@ -2316,6 +2088,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2316 | struct btrfs_key key, found_key; | 2088 | struct btrfs_key key, found_key; |
2317 | struct btrfs_trans_handle *trans; | 2089 | struct btrfs_trans_handle *trans; |
2318 | struct inode *inode; | 2090 | struct inode *inode; |
2091 | u64 last_objectid = 0; | ||
2319 | int ret = 0, nr_unlink = 0, nr_truncate = 0; | 2092 | int ret = 0, nr_unlink = 0, nr_truncate = 0; |
2320 | 2093 | ||
2321 | if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) | 2094 | if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) |
@@ -2367,41 +2140,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2367 | * crossing root thing. we store the inode number in the | 2140 | * crossing root thing. we store the inode number in the |
2368 | * offset of the orphan item. | 2141 | * offset of the orphan item. |
2369 | */ | 2142 | */ |
2143 | |||
2144 | if (found_key.offset == last_objectid) { | ||
2145 | printk(KERN_ERR "btrfs: Error removing orphan entry, " | ||
2146 | "stopping orphan cleanup\n"); | ||
2147 | ret = -EINVAL; | ||
2148 | goto out; | ||
2149 | } | ||
2150 | |||
2151 | last_objectid = found_key.offset; | ||
2152 | |||
2370 | found_key.objectid = found_key.offset; | 2153 | found_key.objectid = found_key.offset; |
2371 | found_key.type = BTRFS_INODE_ITEM_KEY; | 2154 | found_key.type = BTRFS_INODE_ITEM_KEY; |
2372 | found_key.offset = 0; | 2155 | found_key.offset = 0; |
2373 | inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); | 2156 | inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); |
2374 | if (IS_ERR(inode)) { | 2157 | ret = PTR_RET(inode); |
2375 | ret = PTR_ERR(inode); | 2158 | if (ret && ret != -ESTALE) |
2376 | goto out; | 2159 | goto out; |
2377 | } | ||
2378 | 2160 | ||
2379 | /* | 2161 | /* |
2380 | * add this inode to the orphan list so btrfs_orphan_del does | 2162 | * Inode is already gone but the orphan item is still there, |
2381 | * the proper thing when we hit it | 2163 | * kill the orphan item. |
2382 | */ | 2164 | */ |
2383 | spin_lock(&root->orphan_lock); | 2165 | if (ret == -ESTALE) { |
2384 | list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); | 2166 | trans = btrfs_start_transaction(root, 1); |
2385 | spin_unlock(&root->orphan_lock); | ||
2386 | |||
2387 | /* | ||
2388 | * if this is a bad inode, means we actually succeeded in | ||
2389 | * removing the inode, but not the orphan record, which means | ||
2390 | * we need to manually delete the orphan since iput will just | ||
2391 | * do a destroy_inode | ||
2392 | */ | ||
2393 | if (is_bad_inode(inode)) { | ||
2394 | trans = btrfs_start_transaction(root, 0); | ||
2395 | if (IS_ERR(trans)) { | 2167 | if (IS_ERR(trans)) { |
2396 | ret = PTR_ERR(trans); | 2168 | ret = PTR_ERR(trans); |
2397 | goto out; | 2169 | goto out; |
2398 | } | 2170 | } |
2399 | btrfs_orphan_del(trans, inode); | 2171 | ret = btrfs_del_orphan_item(trans, root, |
2172 | found_key.objectid); | ||
2173 | BUG_ON(ret); | ||
2400 | btrfs_end_transaction(trans, root); | 2174 | btrfs_end_transaction(trans, root); |
2401 | iput(inode); | ||
2402 | continue; | 2175 | continue; |
2403 | } | 2176 | } |
2404 | 2177 | ||
2178 | /* | ||
2179 | * add this inode to the orphan list so btrfs_orphan_del does | ||
2180 | * the proper thing when we hit it | ||
2181 | */ | ||
2182 | spin_lock(&root->orphan_lock); | ||
2183 | list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); | ||
2184 | spin_unlock(&root->orphan_lock); | ||
2185 | |||
2405 | /* if we have links, this was a truncate, let's do that */ | 2186 | /* if we have links, this was a truncate, let's do that */ |
2406 | if (inode->i_nlink) { | 2187 | if (inode->i_nlink) { |
2407 | if (!S_ISREG(inode->i_mode)) { | 2188 | if (!S_ISREG(inode->i_mode)) { |
@@ -2420,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
2420 | if (ret) | 2201 | if (ret) |
2421 | goto out; | 2202 | goto out; |
2422 | } | 2203 | } |
2204 | /* release the path since we're done with it */ | ||
2205 | btrfs_release_path(path); | ||
2206 | |||
2423 | root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; | 2207 | root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; |
2424 | 2208 | ||
2425 | if (root->orphan_block_rsv) | 2209 | if (root->orphan_block_rsv) |
@@ -2647,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, | |||
2647 | /* | 2431 | /* |
2648 | * copy everything in the in-memory inode into the btree. | 2432 | * copy everything in the in-memory inode into the btree. |
2649 | */ | 2433 | */ |
2650 | noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, | 2434 | static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, |
2651 | struct btrfs_root *root, struct inode *inode) | 2435 | struct btrfs_root *root, struct inode *inode) |
2652 | { | 2436 | { |
2653 | struct btrfs_inode_item *inode_item; | 2437 | struct btrfs_inode_item *inode_item; |
@@ -2655,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, | |||
2655 | struct extent_buffer *leaf; | 2439 | struct extent_buffer *leaf; |
2656 | int ret; | 2440 | int ret; |
2657 | 2441 | ||
2658 | /* | ||
2659 | * If the inode is a free space inode, we can deadlock during commit | ||
2660 | * if we put it into the delayed code. | ||
2661 | * | ||
2662 | * The data relocation inode should also be directly updated | ||
2663 | * without delay | ||
2664 | */ | ||
2665 | if (!btrfs_is_free_space_inode(root, inode) | ||
2666 | && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { | ||
2667 | ret = btrfs_delayed_update_inode(trans, root, inode); | ||
2668 | if (!ret) | ||
2669 | btrfs_set_inode_last_trans(trans, inode); | ||
2670 | return ret; | ||
2671 | } | ||
2672 | |||
2673 | path = btrfs_alloc_path(); | 2442 | path = btrfs_alloc_path(); |
2674 | if (!path) | 2443 | if (!path) |
2675 | return -ENOMEM; | 2444 | return -ENOMEM; |
@@ -2698,6 +2467,43 @@ failed: | |||
2698 | } | 2467 | } |
2699 | 2468 | ||
2700 | /* | 2469 | /* |
2470 | * copy everything in the in-memory inode into the btree. | ||
2471 | */ | ||
2472 | noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, | ||
2473 | struct btrfs_root *root, struct inode *inode) | ||
2474 | { | ||
2475 | int ret; | ||
2476 | |||
2477 | /* | ||
2478 | * If the inode is a free space inode, we can deadlock during commit | ||
2479 | * if we put it into the delayed code. | ||
2480 | * | ||
2481 | * The data relocation inode should also be directly updated | ||
2482 | * without delay | ||
2483 | */ | ||
2484 | if (!btrfs_is_free_space_inode(root, inode) | ||
2485 | && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { | ||
2486 | ret = btrfs_delayed_update_inode(trans, root, inode); | ||
2487 | if (!ret) | ||
2488 | btrfs_set_inode_last_trans(trans, inode); | ||
2489 | return ret; | ||
2490 | } | ||
2491 | |||
2492 | return btrfs_update_inode_item(trans, root, inode); | ||
2493 | } | ||
2494 | |||
2495 | static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, | ||
2496 | struct btrfs_root *root, struct inode *inode) | ||
2497 | { | ||
2498 | int ret; | ||
2499 | |||
2500 | ret = btrfs_update_inode(trans, root, inode); | ||
2501 | if (ret == -ENOSPC) | ||
2502 | return btrfs_update_inode_item(trans, root, inode); | ||
2503 | return ret; | ||
2504 | } | ||
2505 | |||
2506 | /* | ||
2701 | * unlink helper that gets used here in inode.c and in the tree logging | 2507 | * unlink helper that gets used here in inode.c and in the tree logging |
2702 | * recovery code. It removes a link in a directory with a given name, and | 2508 | * recovery code. It removes a link in a directory with a given name, and |
2703 | * also drops the back refs in the inode to the directory | 2509 | * also drops the back refs in the inode to the directory |
@@ -2835,7 +2641,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, | |||
2835 | u64 ino = btrfs_ino(inode); | 2641 | u64 ino = btrfs_ino(inode); |
2836 | u64 dir_ino = btrfs_ino(dir); | 2642 | u64 dir_ino = btrfs_ino(dir); |
2837 | 2643 | ||
2838 | trans = btrfs_start_transaction(root, 10); | 2644 | /* |
2645 | * 1 for the possible orphan item | ||
2646 | * 1 for the dir item | ||
2647 | * 1 for the dir index | ||
2648 | * 1 for the inode ref | ||
2649 | * 1 for the inode ref in the tree log | ||
2650 | * 2 for the dir entries in the log | ||
2651 | * 1 for the inode | ||
2652 | */ | ||
2653 | trans = btrfs_start_transaction(root, 8); | ||
2839 | if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) | 2654 | if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) |
2840 | return trans; | 2655 | return trans; |
2841 | 2656 | ||
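(The eight units requested above are exactly the sum of the itemized comment: 1 orphan item + 1 dir item + 1 dir index + 1 inode ref + 1 inode ref in the tree log + 2 log dir entries + 1 inode = 8, which is why the flat reservation of 10 was replaced by an itemized 8.)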
@@ -2858,7 +2673,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, | |||
2858 | return ERR_PTR(-ENOMEM); | 2673 | return ERR_PTR(-ENOMEM); |
2859 | } | 2674 | } |
2860 | 2675 | ||
2861 | trans = btrfs_start_transaction(root, 0); | 2676 | /* 1 for the orphan item */ |
2677 | trans = btrfs_start_transaction(root, 1); | ||
2862 | if (IS_ERR(trans)) { | 2678 | if (IS_ERR(trans)) { |
2863 | btrfs_free_path(path); | 2679 | btrfs_free_path(path); |
2864 | root->fs_info->enospc_unlink = 0; | 2680 | root->fs_info->enospc_unlink = 0; |
@@ -2963,6 +2779,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, | |||
2963 | err = 0; | 2779 | err = 0; |
2964 | out: | 2780 | out: |
2965 | btrfs_free_path(path); | 2781 | btrfs_free_path(path); |
2782 | /* Migrate the orphan reservation over */ | ||
2783 | if (!err) | ||
2784 | err = btrfs_block_rsv_migrate(trans->block_rsv, | ||
2785 | &root->fs_info->global_block_rsv, | ||
2786 | trans->bytes_reserved); | ||
2787 | |||
2966 | if (err) { | 2788 | if (err) { |
2967 | btrfs_end_transaction(trans, root); | 2789 | btrfs_end_transaction(trans, root); |
2968 | root->fs_info->enospc_unlink = 0; | 2790 | root->fs_info->enospc_unlink = 0; |
@@ -2977,6 +2799,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans, | |||
2977 | struct btrfs_root *root) | 2799 | struct btrfs_root *root) |
2978 | { | 2800 | { |
2979 | if (trans->block_rsv == &root->fs_info->global_block_rsv) { | 2801 | if (trans->block_rsv == &root->fs_info->global_block_rsv) { |
2802 | btrfs_block_rsv_release(root, trans->block_rsv, | ||
2803 | trans->bytes_reserved); | ||
2804 | trans->block_rsv = &root->fs_info->trans_block_rsv; | ||
2980 | BUG_ON(!root->fs_info->enospc_unlink); | 2805 | BUG_ON(!root->fs_info->enospc_unlink); |
2981 | root->fs_info->enospc_unlink = 0; | 2806 | root->fs_info->enospc_unlink = 0; |
2982 | } | 2807 | } |
@@ -3368,6 +3193,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) | |||
3368 | pgoff_t index = from >> PAGE_CACHE_SHIFT; | 3193 | pgoff_t index = from >> PAGE_CACHE_SHIFT; |
3369 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 3194 | unsigned offset = from & (PAGE_CACHE_SIZE-1); |
3370 | struct page *page; | 3195 | struct page *page; |
3196 | gfp_t mask = btrfs_alloc_write_mask(mapping); | ||
3371 | int ret = 0; | 3197 | int ret = 0; |
3372 | u64 page_start; | 3198 | u64 page_start; |
3373 | u64 page_end; | 3199 | u64 page_end; |
@@ -3380,7 +3206,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) | |||
3380 | 3206 | ||
3381 | ret = -ENOMEM; | 3207 | ret = -ENOMEM; |
3382 | again: | 3208 | again: |
3383 | page = find_or_create_page(mapping, index, GFP_NOFS); | 3209 | page = find_or_create_page(mapping, index, mask); |
3384 | if (!page) { | 3210 | if (!page) { |
3385 | btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); | 3211 | btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); |
3386 | goto out; | 3212 | goto out; |
@@ -3613,6 +3439,8 @@ void btrfs_evict_inode(struct inode *inode) | |||
3613 | { | 3439 | { |
3614 | struct btrfs_trans_handle *trans; | 3440 | struct btrfs_trans_handle *trans; |
3615 | struct btrfs_root *root = BTRFS_I(inode)->root; | 3441 | struct btrfs_root *root = BTRFS_I(inode)->root; |
3442 | struct btrfs_block_rsv *rsv, *global_rsv; | ||
3443 | u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); | ||
3616 | unsigned long nr; | 3444 | unsigned long nr; |
3617 | int ret; | 3445 | int ret; |
3618 | 3446 | ||
@@ -3640,22 +3468,55 @@ void btrfs_evict_inode(struct inode *inode) | |||
3640 | goto no_delete; | 3468 | goto no_delete; |
3641 | } | 3469 | } |
3642 | 3470 | ||
3471 | rsv = btrfs_alloc_block_rsv(root); | ||
3472 | if (!rsv) { | ||
3473 | btrfs_orphan_del(NULL, inode); | ||
3474 | goto no_delete; | ||
3475 | } | ||
3476 | rsv->size = min_size; | ||
3477 | global_rsv = &root->fs_info->global_block_rsv; | ||
3478 | |||
3643 | btrfs_i_size_write(inode, 0); | 3479 | btrfs_i_size_write(inode, 0); |
3644 | 3480 | ||
3481 | /* | ||
3482 | * This is a bit simpler than btrfs_truncate since | ||
3483 | * | ||
3484 | * 1) We've already reserved our space for our orphan item in the | ||
3485 | * unlink. | ||
3486 | * 2) We're going to delete the inode item, so we don't need to update | ||
3487 | * it at all. | ||
3488 | * | ||
3489 | * So we just need to reserve some slack space in case we add bytes when | ||
3490 | * doing the truncate. | ||
3491 | */ | ||
3645 | while (1) { | 3492 | while (1) { |
3646 | trans = btrfs_join_transaction(root); | 3493 | ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); |
3647 | BUG_ON(IS_ERR(trans)); | 3494 | |
3648 | trans->block_rsv = root->orphan_block_rsv; | 3495 | /* |
3496 | * Try and steal from the global reserve since we will | ||
3497 | * likely not use this space anyway, we want to try as | ||
3498 | * hard as possible to get this to work. | ||
3499 | */ | ||
3500 | if (ret) | ||
3501 | ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); | ||
3649 | 3502 | ||
3650 | ret = btrfs_block_rsv_check(trans, root, | ||
3651 | root->orphan_block_rsv, 0, 5); | ||
3652 | if (ret) { | 3503 | if (ret) { |
3653 | BUG_ON(ret != -EAGAIN); | 3504 | printk(KERN_WARNING "Could not get space for a " |
3654 | ret = btrfs_commit_transaction(trans, root); | 3505 | "delete, will truncate on mount %d\n", ret); |
3655 | BUG_ON(ret); | 3506 | btrfs_orphan_del(NULL, inode); |
3656 | continue; | 3507 | btrfs_free_block_rsv(root, rsv); |
3508 | goto no_delete; | ||
3509 | } | ||
3510 | |||
3511 | trans = btrfs_start_transaction(root, 0); | ||
3512 | if (IS_ERR(trans)) { | ||
3513 | btrfs_orphan_del(NULL, inode); | ||
3514 | btrfs_free_block_rsv(root, rsv); | ||
3515 | goto no_delete; | ||
3657 | } | 3516 | } |
3658 | 3517 | ||
3518 | trans->block_rsv = rsv; | ||
3519 | |||
3659 | ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); | 3520 | ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); |
3660 | if (ret != -EAGAIN) | 3521 | if (ret != -EAGAIN) |
3661 | break; | 3522 | break; |
@@ -3664,14 +3525,17 @@ void btrfs_evict_inode(struct inode *inode) | |||
3664 | btrfs_end_transaction(trans, root); | 3525 | btrfs_end_transaction(trans, root); |
3665 | trans = NULL; | 3526 | trans = NULL; |
3666 | btrfs_btree_balance_dirty(root, nr); | 3527 | btrfs_btree_balance_dirty(root, nr); |
3667 | |||
3668 | } | 3528 | } |
3669 | 3529 | ||
3530 | btrfs_free_block_rsv(root, rsv); | ||
3531 | |||
3670 | if (ret == 0) { | 3532 | if (ret == 0) { |
3533 | trans->block_rsv = root->orphan_block_rsv; | ||
3671 | ret = btrfs_orphan_del(trans, inode); | 3534 | ret = btrfs_orphan_del(trans, inode); |
3672 | BUG_ON(ret); | 3535 | BUG_ON(ret); |
3673 | } | 3536 | } |
3674 | 3537 | ||
3538 | trans->block_rsv = &root->fs_info->trans_block_rsv; | ||
3675 | if (!(root == root->fs_info->tree_root || | 3539 | if (!(root == root->fs_info->tree_root || |
3676 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) | 3540 | root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) |
3677 | btrfs_return_ino(root, btrfs_ino(inode)); | 3541 | btrfs_return_ino(root, btrfs_ino(inode)); |
@@ -5795,8 +5659,7 @@ again: | |||
5795 | if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { | 5659 | if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { |
5796 | ret = btrfs_ordered_update_i_size(inode, 0, ordered); | 5660 | ret = btrfs_ordered_update_i_size(inode, 0, ordered); |
5797 | if (!ret) | 5661 | if (!ret) |
5798 | ret = btrfs_update_inode(trans, root, inode); | 5662 | err = btrfs_update_inode_fallback(trans, root, inode); |
5799 | err = ret; | ||
5800 | goto out; | 5663 | goto out; |
5801 | } | 5664 | } |
5802 | 5665 | ||
@@ -5834,7 +5697,7 @@ again: | |||
5834 | add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); | 5697 | add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); |
5835 | ret = btrfs_ordered_update_i_size(inode, 0, ordered); | 5698 | ret = btrfs_ordered_update_i_size(inode, 0, ordered); |
5836 | if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) | 5699 | if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) |
5837 | btrfs_update_inode(trans, root, inode); | 5700 | btrfs_update_inode_fallback(trans, root, inode); |
5838 | ret = 0; | 5701 | ret = 0; |
5839 | out_unlock: | 5702 | out_unlock: |
5840 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, | 5703 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, |
@@ -6289,7 +6152,7 @@ int btrfs_readpage(struct file *file, struct page *page) | |||
6289 | { | 6152 | { |
6290 | struct extent_io_tree *tree; | 6153 | struct extent_io_tree *tree; |
6291 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 6154 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
6292 | return extent_read_full_page(tree, page, btrfs_get_extent); | 6155 | return extent_read_full_page(tree, page, btrfs_get_extent, 0); |
6293 | } | 6156 | } |
6294 | 6157 | ||
6295 | static int btrfs_writepage(struct page *page, struct writeback_control *wbc) | 6158 | static int btrfs_writepage(struct page *page, struct writeback_control *wbc) |
@@ -6541,6 +6404,7 @@ static int btrfs_truncate(struct inode *inode) | |||
6541 | struct btrfs_trans_handle *trans; | 6404 | struct btrfs_trans_handle *trans; |
6542 | unsigned long nr; | 6405 | unsigned long nr; |
6543 | u64 mask = root->sectorsize - 1; | 6406 | u64 mask = root->sectorsize - 1; |
6407 | u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); | ||
6544 | 6408 | ||
6545 | ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); | 6409 | ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); |
6546 | if (ret) | 6410 | if (ret) |
@@ -6588,19 +6452,23 @@ static int btrfs_truncate(struct inode *inode) | |||
6588 | rsv = btrfs_alloc_block_rsv(root); | 6452 | rsv = btrfs_alloc_block_rsv(root); |
6589 | if (!rsv) | 6453 | if (!rsv) |
6590 | return -ENOMEM; | 6454 | return -ENOMEM; |
6591 | btrfs_add_durable_block_rsv(root->fs_info, rsv); | 6455 | rsv->size = min_size; |
6592 | 6456 | ||
6457 | /* | ||
6458 | * 1 for the truncate slack space | ||
6459 | * 1 for the orphan item we're going to add | ||
6460 | * 1 for the orphan item deletion | ||
6461 | * 1 for updating the inode. | ||
6462 | */ | ||
6593 | trans = btrfs_start_transaction(root, 4); | 6463 | trans = btrfs_start_transaction(root, 4); |
6594 | if (IS_ERR(trans)) { | 6464 | if (IS_ERR(trans)) { |
6595 | err = PTR_ERR(trans); | 6465 | err = PTR_ERR(trans); |
6596 | goto out; | 6466 | goto out; |
6597 | } | 6467 | } |
6598 | 6468 | ||
6599 | /* | 6469 | /* Migrate the slack space for the truncate to our reserve */ |
6600 | * Reserve space for the truncate process. Truncate should be adding | 6470 | ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, |
6601 | * space, but if there are snapshots it may end up using space. | 6471 | min_size); |
6602 | */ | ||
6603 | ret = btrfs_truncate_reserve_metadata(trans, root, rsv); | ||
6604 | BUG_ON(ret); | 6472 | BUG_ON(ret); |
6605 | 6473 | ||
6606 | ret = btrfs_orphan_add(trans, inode); | 6474 | ret = btrfs_orphan_add(trans, inode); |
@@ -6609,21 +6477,6 @@ static int btrfs_truncate(struct inode *inode) | |||
6609 | goto out; | 6477 | goto out; |
6610 | } | 6478 | } |
6611 | 6479 | ||
6612 | nr = trans->blocks_used; | ||
6613 | btrfs_end_transaction(trans, root); | ||
6614 | btrfs_btree_balance_dirty(root, nr); | ||
6615 | |||
6616 | /* | ||
6617 | * Ok so we've already migrated our bytes over for the truncate, so here | ||
6618 | * just reserve the one slot we need for updating the inode. | ||
6619 | */ | ||
6620 | trans = btrfs_start_transaction(root, 1); | ||
6621 | if (IS_ERR(trans)) { | ||
6622 | err = PTR_ERR(trans); | ||
6623 | goto out; | ||
6624 | } | ||
6625 | trans->block_rsv = rsv; | ||
6626 | |||
6627 | /* | 6480 | /* |
6628 | * setattr is responsible for setting the ordered_data_close flag, | 6481 | * setattr is responsible for setting the ordered_data_close flag, |
6629 | * but that is only tested during the last file release. That | 6482 | * but that is only tested during the last file release. That |
@@ -6645,20 +6498,30 @@ static int btrfs_truncate(struct inode *inode) | |||
6645 | btrfs_add_ordered_operation(trans, root, inode); | 6498 | btrfs_add_ordered_operation(trans, root, inode); |
6646 | 6499 | ||
6647 | while (1) { | 6500 | while (1) { |
6501 | ret = btrfs_block_rsv_refill(root, rsv, min_size); | ||
6502 | if (ret) { | ||
6503 | /* | ||
6504 | * This can only happen with the original transaction we | ||
6505 | * started above, every other time we shouldn't have a | ||
6506 | * transaction started yet. | ||
6507 | */ | ||
6508 | if (ret == -EAGAIN) | ||
6509 | goto end_trans; | ||
6510 | err = ret; | ||
6511 | break; | ||
6512 | } | ||
6513 | |||
6648 | if (!trans) { | 6514 | if (!trans) { |
6649 | trans = btrfs_start_transaction(root, 3); | 6515 | /* Just need the 1 for updating the inode */ |
6516 | trans = btrfs_start_transaction(root, 1); | ||
6650 | if (IS_ERR(trans)) { | 6517 | if (IS_ERR(trans)) { |
6651 | err = PTR_ERR(trans); | 6518 | err = PTR_ERR(trans); |
6652 | goto out; | 6519 | goto out; |
6653 | } | 6520 | } |
6654 | |||
6655 | ret = btrfs_truncate_reserve_metadata(trans, root, | ||
6656 | rsv); | ||
6657 | BUG_ON(ret); | ||
6658 | |||
6659 | trans->block_rsv = rsv; | ||
6660 | } | 6521 | } |
6661 | 6522 | ||
6523 | trans->block_rsv = rsv; | ||
6524 | |||
6662 | ret = btrfs_truncate_inode_items(trans, root, inode, | 6525 | ret = btrfs_truncate_inode_items(trans, root, inode, |
6663 | inode->i_size, | 6526 | inode->i_size, |
6664 | BTRFS_EXTENT_DATA_KEY); | 6527 | BTRFS_EXTENT_DATA_KEY); |
@@ -6673,7 +6536,7 @@ static int btrfs_truncate(struct inode *inode) | |||
6673 | err = ret; | 6536 | err = ret; |
6674 | break; | 6537 | break; |
6675 | } | 6538 | } |
6676 | 6539 | end_trans: | |
6677 | nr = trans->blocks_used; | 6540 | nr = trans->blocks_used; |
6678 | btrfs_end_transaction(trans, root); | 6541 | btrfs_end_transaction(trans, root); |
6679 | trans = NULL; | 6542 | trans = NULL; |
@@ -6693,14 +6556,16 @@ static int btrfs_truncate(struct inode *inode) | |||
6693 | ret = btrfs_orphan_del(NULL, inode); | 6556 | ret = btrfs_orphan_del(NULL, inode); |
6694 | } | 6557 | } |
6695 | 6558 | ||
6696 | trans->block_rsv = &root->fs_info->trans_block_rsv; | 6559 | if (trans) { |
6697 | ret = btrfs_update_inode(trans, root, inode); | 6560 | trans->block_rsv = &root->fs_info->trans_block_rsv; |
6698 | if (ret && !err) | 6561 | ret = btrfs_update_inode(trans, root, inode); |
6699 | err = ret; | 6562 | if (ret && !err) |
6563 | err = ret; | ||
6700 | 6564 | ||
6701 | nr = trans->blocks_used; | 6565 | nr = trans->blocks_used; |
6702 | ret = btrfs_end_transaction_throttle(trans, root); | 6566 | ret = btrfs_end_transaction_throttle(trans, root); |
6703 | btrfs_btree_balance_dirty(root, nr); | 6567 | btrfs_btree_balance_dirty(root, nr); |
6568 | } | ||
6704 | 6569 | ||
6705 | out: | 6570 | out: |
6706 | btrfs_free_block_rsv(root, rsv); | 6571 | btrfs_free_block_rsv(root, rsv); |
@@ -6755,9 +6620,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) | |||
6755 | ei->last_sub_trans = 0; | 6620 | ei->last_sub_trans = 0; |
6756 | ei->logged_trans = 0; | 6621 | ei->logged_trans = 0; |
6757 | ei->delalloc_bytes = 0; | 6622 | ei->delalloc_bytes = 0; |
6758 | ei->reserved_bytes = 0; | ||
6759 | ei->disk_i_size = 0; | 6623 | ei->disk_i_size = 0; |
6760 | ei->flags = 0; | 6624 | ei->flags = 0; |
6625 | ei->csum_bytes = 0; | ||
6761 | ei->index_cnt = (u64)-1; | 6626 | ei->index_cnt = (u64)-1; |
6762 | ei->last_unlink_trans = 0; | 6627 | ei->last_unlink_trans = 0; |
6763 | 6628 | ||
@@ -6769,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) | |||
6769 | ei->orphan_meta_reserved = 0; | 6634 | ei->orphan_meta_reserved = 0; |
6770 | ei->dummy_inode = 0; | 6635 | ei->dummy_inode = 0; |
6771 | ei->in_defrag = 0; | 6636 | ei->in_defrag = 0; |
6637 | ei->delalloc_meta_reserved = 0; | ||
6772 | ei->force_compress = BTRFS_COMPRESS_NONE; | 6638 | ei->force_compress = BTRFS_COMPRESS_NONE; |
6773 | 6639 | ||
6774 | ei->delayed_node = NULL; | 6640 | ei->delayed_node = NULL; |
@@ -6803,6 +6669,8 @@ void btrfs_destroy_inode(struct inode *inode) | |||
6803 | WARN_ON(inode->i_data.nrpages); | 6669 | WARN_ON(inode->i_data.nrpages); |
6804 | WARN_ON(BTRFS_I(inode)->outstanding_extents); | 6670 | WARN_ON(BTRFS_I(inode)->outstanding_extents); |
6805 | WARN_ON(BTRFS_I(inode)->reserved_extents); | 6671 | WARN_ON(BTRFS_I(inode)->reserved_extents); |
6672 | WARN_ON(BTRFS_I(inode)->delalloc_bytes); | ||
6673 | WARN_ON(BTRFS_I(inode)->csum_bytes); | ||
6806 | 6674 | ||
6807 | /* | 6675 | /* |
6808 | * This can happen where we create an inode, but somebody else also | 6676 | * This can happen where we create an inode, but somebody else also |
@@ -6926,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt, | |||
6926 | struct dentry *dentry, struct kstat *stat) | 6794 | struct dentry *dentry, struct kstat *stat) |
6927 | { | 6795 | { |
6928 | struct inode *inode = dentry->d_inode; | 6796 | struct inode *inode = dentry->d_inode; |
6797 | u32 blocksize = inode->i_sb->s_blocksize; | ||
6798 | |||
6929 | generic_fillattr(inode, stat); | 6799 | generic_fillattr(inode, stat); |
6930 | stat->dev = BTRFS_I(inode)->root->anon_dev; | 6800 | stat->dev = BTRFS_I(inode)->root->anon_dev; |
6931 | stat->blksize = PAGE_CACHE_SIZE; | 6801 | stat->blksize = PAGE_CACHE_SIZE; |
6932 | stat->blocks = (inode_get_bytes(inode) + | 6802 | stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + |
6933 | BTRFS_I(inode)->delalloc_bytes) >> 9; | 6803 | ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; |
6934 | return 0; | 6804 | return 0; |
6935 | } | 6805 | } |
6936 | 6806 | ||
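(Worked example of the new rounding in btrfs_getattr: with a 4096-byte blocksize, an inode with inode_get_bytes() == 6000 and delalloc_bytes == 100 now reports (ALIGN(6000, 4096) + ALIGN(100, 4096)) >> 9 = (8192 + 4096) >> 9 = 24 sectors, where the old expression gave (6000 + 100) >> 9 = 11.)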
@@ -7420,7 +7290,6 @@ static struct extent_io_ops btrfs_extent_io_ops = { | |||
7420 | .readpage_end_io_hook = btrfs_readpage_end_io_hook, | 7290 | .readpage_end_io_hook = btrfs_readpage_end_io_hook, |
7421 | .writepage_end_io_hook = btrfs_writepage_end_io_hook, | 7291 | .writepage_end_io_hook = btrfs_writepage_end_io_hook, |
7422 | .writepage_start_hook = btrfs_writepage_start_hook, | 7292 | .writepage_start_hook = btrfs_writepage_start_hook, |
7423 | .readpage_io_failed_hook = btrfs_io_failed_hook, | ||
7424 | .set_bit_hook = btrfs_set_bit_hook, | 7293 | .set_bit_hook = btrfs_set_bit_hook, |
7425 | .clear_bit_hook = btrfs_clear_bit_hook, | 7294 | .clear_bit_hook = btrfs_clear_bit_hook, |
7426 | .merge_extent_hook = btrfs_merge_extent_hook, | 7295 | .merge_extent_hook = btrfs_merge_extent_hook, |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dae5dfe41ba..72d461656f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include "volumes.h" | 51 | #include "volumes.h" |
52 | #include "locking.h" | 52 | #include "locking.h" |
53 | #include "inode-map.h" | 53 | #include "inode-map.h" |
54 | #include "backref.h" | ||
54 | 55 | ||
55 | /* Mask out flags that are inappropriate for the given type of inode. */ | 56 | /* Mask out flags that are inappropriate for the given type of inode. */ |
56 | static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) | 57 | static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) |
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode) | |||
117 | /* | 118 | /* |
118 | * Inherit flags from the parent inode. | 119 | * Inherit flags from the parent inode. |
119 | * | 120 | * |
120 | * Unlike extN we don't have any flags we don't want to inherit currently. | 121 | * Currently only the compression flags and the cow flags are inherited. |
121 | */ | 122 | */ |
122 | void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) | 123 | void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) |
123 | { | 124 | { |
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) | |||
128 | 129 | ||
129 | flags = BTRFS_I(dir)->flags; | 130 | flags = BTRFS_I(dir)->flags; |
130 | 131 | ||
131 | if (S_ISREG(inode->i_mode)) | 132 | if (flags & BTRFS_INODE_NOCOMPRESS) { |
132 | flags &= ~BTRFS_INODE_DIRSYNC; | 133 | BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; |
133 | else if (!S_ISDIR(inode->i_mode)) | 134 | BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; |
134 | flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); | 135 | } else if (flags & BTRFS_INODE_COMPRESS) { |
136 | BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; | ||
137 | BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; | ||
138 | } | ||
139 | |||
140 | if (flags & BTRFS_INODE_NODATACOW) | ||
141 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; | ||
135 | 142 | ||
136 | BTRFS_I(inode)->flags = flags; | ||
137 | btrfs_update_iflags(inode); | 143 | btrfs_update_iflags(inode); |
138 | } | 144 | } |
139 | 145 | ||
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) | |||
277 | struct fstrim_range range; | 283 | struct fstrim_range range; |
278 | u64 minlen = ULLONG_MAX; | 284 | u64 minlen = ULLONG_MAX; |
279 | u64 num_devices = 0; | 285 | u64 num_devices = 0; |
286 | u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); | ||
280 | int ret; | 287 | int ret; |
281 | 288 | ||
282 | if (!capable(CAP_SYS_ADMIN)) | 289 | if (!capable(CAP_SYS_ADMIN)) |
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) | |||
295 | } | 302 | } |
296 | } | 303 | } |
297 | rcu_read_unlock(); | 304 | rcu_read_unlock(); |
305 | |||
298 | if (!num_devices) | 306 | if (!num_devices) |
299 | return -EOPNOTSUPP; | 307 | return -EOPNOTSUPP; |
300 | |||
301 | if (copy_from_user(&range, arg, sizeof(range))) | 308 | if (copy_from_user(&range, arg, sizeof(range))) |
302 | return -EFAULT; | 309 | return -EFAULT; |
310 | if (range.start > total_bytes) | ||
311 | return -EINVAL; | ||
303 | 312 | ||
313 | range.len = min(range.len, total_bytes - range.start); | ||
304 | range.minlen = max(range.minlen, minlen); | 314 | range.minlen = max(range.minlen, minlen); |
305 | ret = btrfs_trim_fs(root, &range); | 315 | ret = btrfs_trim_fs(root, &range); |
306 | if (ret < 0) | 316 | if (ret < 0) |
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, | |||
760 | int ret = 1; | 770 | int ret = 1; |
761 | 771 | ||
762 | /* | 772 | /* |
763 | * make sure that once we start defragging and extent, we keep on | 773 | * make sure that once we start defragging an extent, we keep on |
764 | * defragging it | 774 | * defragging it |
765 | */ | 775 | */ |
766 | if (start < *defrag_end) | 776 | if (start < *defrag_end) |
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, | |||
805 | * extent will force at least part of that big extent to be defragged. | 815 | * extent will force at least part of that big extent to be defragged. |
806 | */ | 816 | */ |
807 | if (ret) { | 817 | if (ret) { |
808 | *last_len += len; | ||
809 | *defrag_end = extent_map_end(em); | 818 | *defrag_end = extent_map_end(em); |
810 | } else { | 819 | } else { |
811 | *last_len = 0; | 820 | *last_len = 0; |
@@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode, | |||
843 | int i_done; | 852 | int i_done; |
844 | struct btrfs_ordered_extent *ordered; | 853 | struct btrfs_ordered_extent *ordered; |
845 | struct extent_state *cached_state = NULL; | 854 | struct extent_state *cached_state = NULL; |
855 | gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); | ||
846 | 856 | ||
847 | if (isize == 0) | 857 | if (isize == 0) |
848 | return 0; | 858 | return 0; |
@@ -860,7 +870,7 @@ again: | |||
860 | for (i = 0; i < num_pages; i++) { | 870 | for (i = 0; i < num_pages; i++) { |
861 | struct page *page; | 871 | struct page *page; |
862 | page = find_or_create_page(inode->i_mapping, | 872 | page = find_or_create_page(inode->i_mapping, |
863 | start_index + i, GFP_NOFS); | 873 | start_index + i, mask); |
864 | if (!page) | 874 | if (!page) |
865 | break; | 875 | break; |
866 | 876 | ||
@@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
972 | struct btrfs_super_block *disk_super; | 982 | struct btrfs_super_block *disk_super; |
973 | struct file_ra_state *ra = NULL; | 983 | struct file_ra_state *ra = NULL; |
974 | unsigned long last_index; | 984 | unsigned long last_index; |
985 | u64 isize = i_size_read(inode); | ||
975 | u64 features; | 986 | u64 features; |
976 | u64 last_len = 0; | 987 | u64 last_len = 0; |
977 | u64 skip = 0; | 988 | u64 skip = 0; |
978 | u64 defrag_end = 0; | 989 | u64 defrag_end = 0; |
979 | u64 newer_off = range->start; | 990 | u64 newer_off = range->start; |
980 | int newer_left = 0; | ||
981 | unsigned long i; | 991 | unsigned long i; |
992 | unsigned long ra_index = 0; | ||
982 | int ret; | 993 | int ret; |
983 | int defrag_count = 0; | 994 | int defrag_count = 0; |
984 | int compress_type = BTRFS_COMPRESS_ZLIB; | 995 | int compress_type = BTRFS_COMPRESS_ZLIB; |
985 | int extent_thresh = range->extent_thresh; | 996 | int extent_thresh = range->extent_thresh; |
986 | int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; | 997 | int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; |
998 | int cluster = max_cluster; | ||
987 | u64 new_align = ~((u64)128 * 1024 - 1); | 999 | u64 new_align = ~((u64)128 * 1024 - 1); |
988 | struct page **pages = NULL; | 1000 | struct page **pages = NULL; |
989 | 1001 | ||
@@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
997 | compress_type = range->compress_type; | 1009 | compress_type = range->compress_type; |
998 | } | 1010 | } |
999 | 1011 | ||
1000 | if (inode->i_size == 0) | 1012 | if (isize == 0) |
1001 | return 0; | 1013 | return 0; |
1002 | 1014 | ||
1003 | /* | 1015 | /* |
@@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1013 | ra = &file->f_ra; | 1025 | ra = &file->f_ra; |
1014 | } | 1026 | } |
1015 | 1027 | ||
1016 | pages = kmalloc(sizeof(struct page *) * newer_cluster, | 1028 | pages = kmalloc(sizeof(struct page *) * max_cluster, |
1017 | GFP_NOFS); | 1029 | GFP_NOFS); |
1018 | if (!pages) { | 1030 | if (!pages) { |
1019 | ret = -ENOMEM; | 1031 | ret = -ENOMEM; |
@@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1022 | 1034 | ||
1023 | /* find the last page to defrag */ | 1035 | /* find the last page to defrag */ |
1024 | if (range->start + range->len > range->start) { | 1036 | if (range->start + range->len > range->start) { |
1025 | last_index = min_t(u64, inode->i_size - 1, | 1037 | last_index = min_t(u64, isize - 1, |
1026 | range->start + range->len - 1) >> PAGE_CACHE_SHIFT; | 1038 | range->start + range->len - 1) >> PAGE_CACHE_SHIFT; |
1027 | } else { | 1039 | } else { |
1028 | last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; | 1040 | last_index = (isize - 1) >> PAGE_CACHE_SHIFT; |
1029 | } | 1041 | } |
1030 | 1042 | ||
1031 | if (newer_than) { | 1043 | if (newer_than) { |
@@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1038 | * the extents in the file evenly spaced | 1050 | * the extents in the file evenly spaced |
1039 | */ | 1051 | */ |
1040 | i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; | 1052 | i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; |
1041 | newer_left = newer_cluster; | ||
1042 | } else | 1053 | } else |
1043 | goto out_ra; | 1054 | goto out_ra; |
1044 | } else { | 1055 | } else { |
1045 | i = range->start >> PAGE_CACHE_SHIFT; | 1056 | i = range->start >> PAGE_CACHE_SHIFT; |
1046 | } | 1057 | } |
1047 | if (!max_to_defrag) | 1058 | if (!max_to_defrag) |
1048 | max_to_defrag = last_index - 1; | 1059 | max_to_defrag = last_index; |
1049 | 1060 | ||
1050 | /* | 1061 | /* |
1051 | * make writeback start from i, so the defrag range can be | 1062 | * make writeback start from i, so the defrag range can be |
@@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1079 | i = max(i + 1, next); | 1090 | i = max(i + 1, next); |
1080 | continue; | 1091 | continue; |
1081 | } | 1092 | } |
1093 | |||
1094 | if (!newer_than) { | ||
1095 | cluster = (PAGE_CACHE_ALIGN(defrag_end) >> | ||
1096 | PAGE_CACHE_SHIFT) - i; | ||
1097 | cluster = min(cluster, max_cluster); | ||
1098 | } else { | ||
1099 | cluster = max_cluster; | ||
1100 | } | ||
1101 | |||
1082 | if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) | 1102 | if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) |
1083 | BTRFS_I(inode)->force_compress = compress_type; | 1103 | BTRFS_I(inode)->force_compress = compress_type; |
1084 | 1104 | ||
1085 | btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); | 1105 | if (i + cluster > ra_index) { |
1106 | ra_index = max(i, ra_index); | ||
1107 | btrfs_force_ra(inode->i_mapping, ra, file, ra_index, | ||
1108 | cluster); | ||
1109 | ra_index += max_cluster; | ||
1110 | } | ||
1086 | 1111 | ||
1087 | ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); | 1112 | ret = cluster_pages_for_defrag(inode, pages, i, cluster); |
1088 | if (ret < 0) | 1113 | if (ret < 0) |
1089 | goto out_ra; | 1114 | goto out_ra; |
1090 | 1115 | ||
1091 | defrag_count += ret; | 1116 | defrag_count += ret; |
1092 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); | 1117 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); |
1093 | i += ret; | ||
1094 | 1118 | ||
1095 | if (newer_than) { | 1119 | if (newer_than) { |
1096 | if (newer_off == (u64)-1) | 1120 | if (newer_off == (u64)-1) |
@@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1105 | if (!ret) { | 1129 | if (!ret) { |
1106 | range->start = newer_off; | 1130 | range->start = newer_off; |
1107 | i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; | 1131 | i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; |
1108 | newer_left = newer_cluster; | ||
1109 | } else { | 1132 | } else { |
1110 | break; | 1133 | break; |
1111 | } | 1134 | } |
1112 | } else { | 1135 | } else { |
1113 | i++; | 1136 | if (ret > 0) { |
1137 | i += ret; | ||
1138 | last_len += ret << PAGE_CACHE_SHIFT; | ||
1139 | } else { | ||
1140 | i++; | ||
1141 | last_len = 0; | ||
1142 | } | ||
1114 | } | 1143 | } |
1115 | } | 1144 | } |
1116 | 1145 | ||
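(To see the new cluster sizing at work, assume 4096-byte pages, so max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT = 64. If a previous pass left defrag_end at byte 1228800 — page 300 — and the loop is at i = 290, the !newer_than branch computes cluster = 300 - 290 = 10, and min(cluster, max_cluster) keeps it at 10, so readahead and page collection stop at the end of the extent being defragged instead of always pulling in 64 pages.)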
@@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
1136 | mutex_unlock(&inode->i_mutex); | 1165 | mutex_unlock(&inode->i_mutex); |
1137 | } | 1166 | } |
1138 | 1167 | ||
1139 | disk_super = &root->fs_info->super_copy; | 1168 | disk_super = root->fs_info->super_copy; |
1140 | features = btrfs_super_incompat_flags(disk_super); | 1169 | features = btrfs_super_incompat_flags(disk_super); |
1141 | if (range->compress_type == BTRFS_COMPRESS_LZO) { | 1170 | if (range->compress_type == BTRFS_COMPRESS_LZO) { |
1142 | features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; | 1171 | features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; |
1143 | btrfs_set_super_incompat_flags(disk_super, features); | 1172 | btrfs_set_super_incompat_flags(disk_super, features); |
1144 | } | 1173 | } |
1145 | 1174 | ||
1146 | if (!file) | 1175 | ret = defrag_count; |
1147 | kfree(ra); | ||
1148 | return defrag_count; | ||
1149 | 1176 | ||
1150 | out_ra: | 1177 | out_ra: |
1151 | if (!file) | 1178 | if (!file) |
@@ -1189,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1189 | *devstr = '\0'; | 1216 | *devstr = '\0'; |
1190 | devstr = vol_args->name; | 1217 | devstr = vol_args->name; |
1191 | devid = simple_strtoull(devstr, &end, 10); | 1218 | devid = simple_strtoull(devstr, &end, 10); |
1192 | printk(KERN_INFO "resizing devid %llu\n", | 1219 | printk(KERN_INFO "btrfs: resizing devid %llu\n", |
1193 | (unsigned long long)devid); | 1220 | (unsigned long long)devid); |
1194 | } | 1221 | } |
1195 | device = btrfs_find_device(root, devid, NULL, NULL); | 1222 | device = btrfs_find_device(root, devid, NULL, NULL); |
1196 | if (!device) { | 1223 | if (!device) { |
1197 | printk(KERN_INFO "resizer unable to find device %llu\n", | 1224 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", |
1198 | (unsigned long long)devid); | 1225 | (unsigned long long)devid); |
1199 | ret = -EINVAL; | 1226 | ret = -EINVAL; |
1200 | goto out_unlock; | 1227 | goto out_unlock; |
@@ -1240,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1240 | do_div(new_size, root->sectorsize); | 1267 | do_div(new_size, root->sectorsize); |
1241 | new_size *= root->sectorsize; | 1268 | new_size *= root->sectorsize; |
1242 | 1269 | ||
1243 | printk(KERN_INFO "new size for %s is %llu\n", | 1270 | printk(KERN_INFO "btrfs: new size for %s is %llu\n", |
1244 | device->name, (unsigned long long)new_size); | 1271 | device->name, (unsigned long long)new_size); |
1245 | 1272 | ||
1246 | if (new_size > old_size) { | 1273 | if (new_size > old_size) { |
@@ -1251,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, | |||
1251 | } | 1278 | } |
1252 | ret = btrfs_grow_device(trans, device, new_size); | 1279 | ret = btrfs_grow_device(trans, device, new_size); |
1253 | btrfs_commit_transaction(trans, root); | 1280 | btrfs_commit_transaction(trans, root); |
1254 | } else { | 1281 | } else if (new_size < old_size) { |
1255 | ret = btrfs_shrink_device(device, new_size); | 1282 | ret = btrfs_shrink_device(device, new_size); |
1256 | } | 1283 | } |
1257 | 1284 | ||
@@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) | |||
2587 | return PTR_ERR(trans); | 2614 | return PTR_ERR(trans); |
2588 | } | 2615 | } |
2589 | 2616 | ||
2590 | dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); | 2617 | dir_id = btrfs_super_root_dir(root->fs_info->super_copy); |
2591 | di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, | 2618 | di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, |
2592 | dir_id, "default", 7, 1); | 2619 | dir_id, "default", 7, 1); |
2593 | if (IS_ERR_OR_NULL(di)) { | 2620 | if (IS_ERR_OR_NULL(di)) { |
@@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) | |||
2603 | btrfs_mark_buffer_dirty(path->nodes[0]); | 2630 | btrfs_mark_buffer_dirty(path->nodes[0]); |
2604 | btrfs_free_path(path); | 2631 | btrfs_free_path(path); |
2605 | 2632 | ||
2606 | disk_super = &root->fs_info->super_copy; | 2633 | disk_super = root->fs_info->super_copy; |
2607 | features = btrfs_super_incompat_flags(disk_super); | 2634 | features = btrfs_super_incompat_flags(disk_super); |
2608 | if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { | 2635 | if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { |
2609 | features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; | 2636 | features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; |
@@ -2864,6 +2891,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, | |||
2864 | return ret; | 2891 | return ret; |
2865 | } | 2892 | } |
2866 | 2893 | ||
2894 | static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) | ||
2895 | { | ||
2896 | int ret = 0; | ||
2897 | int i; | ||
2898 | u64 rel_ptr; | ||
2899 | int size; | ||
2900 | struct btrfs_ioctl_ino_path_args *ipa = NULL; | ||
2901 | struct inode_fs_paths *ipath = NULL; | ||
2902 | struct btrfs_path *path; | ||
2903 | |||
2904 | if (!capable(CAP_SYS_ADMIN)) | ||
2905 | return -EPERM; | ||
2906 | |||
2907 | path = btrfs_alloc_path(); | ||
2908 | if (!path) { | ||
2909 | ret = -ENOMEM; | ||
2910 | goto out; | ||
2911 | } | ||
2912 | |||
2913 | ipa = memdup_user(arg, sizeof(*ipa)); | ||
2914 | if (IS_ERR(ipa)) { | ||
2915 | ret = PTR_ERR(ipa); | ||
2916 | ipa = NULL; | ||
2917 | goto out; | ||
2918 | } | ||
2919 | |||
2920 | size = min_t(u32, ipa->size, 4096); | ||
2921 | ipath = init_ipath(size, root, path); | ||
2922 | if (IS_ERR(ipath)) { | ||
2923 | ret = PTR_ERR(ipath); | ||
2924 | ipath = NULL; | ||
2925 | goto out; | ||
2926 | } | ||
2927 | |||
2928 | ret = paths_from_inode(ipa->inum, ipath); | ||
2929 | if (ret < 0) | ||
2930 | goto out; | ||
2931 | |||
2932 | for (i = 0; i < ipath->fspath->elem_cnt; ++i) { | ||
2933 | rel_ptr = ipath->fspath->val[i] - | ||
2934 | (u64)(unsigned long)ipath->fspath->val; | ||
2935 | ipath->fspath->val[i] = rel_ptr; | ||
2936 | } | ||
2937 | |||
2938 | ret = copy_to_user((void *)(unsigned long)ipa->fspath, | ||
2939 | (void *)(unsigned long)ipath->fspath, size); | ||
2940 | if (ret) { | ||
2941 | ret = -EFAULT; | ||
2942 | goto out; | ||
2943 | } | ||
2944 | |||
2945 | out: | ||
2946 | btrfs_free_path(path); | ||
2947 | free_ipath(ipath); | ||
2948 | kfree(ipa); | ||
2949 | |||
2950 | return ret; | ||
2951 | } | ||
2952 | |||
2953 | static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) | ||
2954 | { | ||
2955 | struct btrfs_data_container *inodes = ctx; | ||
2956 | const size_t c = 3 * sizeof(u64); | ||
2957 | |||
2958 | if (inodes->bytes_left >= c) { | ||
2959 | inodes->bytes_left -= c; | ||
2960 | inodes->val[inodes->elem_cnt] = inum; | ||
2961 | inodes->val[inodes->elem_cnt + 1] = offset; | ||
2962 | inodes->val[inodes->elem_cnt + 2] = root; | ||
2963 | inodes->elem_cnt += 3; | ||
2964 | } else { | ||
2965 | inodes->bytes_missing += c - inodes->bytes_left; | ||
2966 | inodes->bytes_left = 0; | ||
2967 | inodes->elem_missed += 3; | ||
2968 | } | ||
2969 | |||
2970 | return 0; | ||
2971 | } | ||
2972 | |||
2973 | static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, | ||
2974 | void __user *arg) | ||
2975 | { | ||
2976 | int ret = 0; | ||
2977 | int size; | ||
2978 | u64 extent_offset; | ||
2979 | struct btrfs_ioctl_logical_ino_args *loi; | ||
2980 | struct btrfs_data_container *inodes = NULL; | ||
2981 | struct btrfs_path *path = NULL; | ||
2982 | struct btrfs_key key; | ||
2983 | |||
2984 | if (!capable(CAP_SYS_ADMIN)) | ||
2985 | return -EPERM; | ||
2986 | |||
2987 | loi = memdup_user(arg, sizeof(*loi)); | ||
2988 | if (IS_ERR(loi)) { | ||
2989 | ret = PTR_ERR(loi); | ||
2990 | loi = NULL; | ||
2991 | goto out; | ||
2992 | } | ||
2993 | |||
2994 | path = btrfs_alloc_path(); | ||
2995 | if (!path) { | ||
2996 | ret = -ENOMEM; | ||
2997 | goto out; | ||
2998 | } | ||
2999 | |||
3000 | size = min_t(u32, loi->size, 4096); | ||
3001 | inodes = init_data_container(size); | ||
3002 | if (IS_ERR(inodes)) { | ||
3003 | ret = PTR_ERR(inodes); | ||
3004 | inodes = NULL; | ||
3005 | goto out; | ||
3006 | } | ||
3007 | |||
3008 | ret = extent_from_logical(root->fs_info, loi->logical, path, &key); | ||
3009 | |||
3010 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) | ||
3011 | ret = -ENOENT; | ||
3012 | if (ret < 0) | ||
3013 | goto out; | ||
3014 | |||
3015 | extent_offset = loi->logical - key.objectid; | ||
3016 | ret = iterate_extent_inodes(root->fs_info, path, key.objectid, | ||
3017 | extent_offset, build_ino_list, inodes); | ||
3018 | |||
3019 | if (ret < 0) | ||
3020 | goto out; | ||
3021 | |||
3022 | ret = copy_to_user((void *)(unsigned long)loi->inodes, | ||
3023 | (void *)(unsigned long)inodes, size); | ||
3024 | if (ret) | ||
3025 | ret = -EFAULT; | ||
3026 | |||
3027 | out: | ||
3028 | btrfs_free_path(path); | ||
3029 | kfree(inodes); | ||
3030 | kfree(loi); | ||
3031 | |||
3032 | return ret; | ||
3033 | } | ||
3034 | |||
2867 | long btrfs_ioctl(struct file *file, unsigned int | 3035 | long btrfs_ioctl(struct file *file, unsigned int |
2868 | cmd, unsigned long arg) | 3036 | cmd, unsigned long arg) |
2869 | { | 3037 | { |
@@ -2921,6 +3089,10 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
2921 | return btrfs_ioctl_tree_search(file, argp); | 3089 | return btrfs_ioctl_tree_search(file, argp); |
2922 | case BTRFS_IOC_INO_LOOKUP: | 3090 | case BTRFS_IOC_INO_LOOKUP: |
2923 | return btrfs_ioctl_ino_lookup(file, argp); | 3091 | return btrfs_ioctl_ino_lookup(file, argp); |
3092 | case BTRFS_IOC_INO_PATHS: | ||
3093 | return btrfs_ioctl_ino_to_path(root, argp); | ||
3094 | case BTRFS_IOC_LOGICAL_INO: | ||
3095 | return btrfs_ioctl_logical_to_ino(root, argp); | ||
2924 | case BTRFS_IOC_SPACE_INFO: | 3096 | case BTRFS_IOC_SPACE_INFO: |
2925 | return btrfs_ioctl_space_info(root, argp); | 3097 | return btrfs_ioctl_space_info(root, argp); |
2926 | case BTRFS_IOC_SYNC: | 3098 | case BTRFS_IOC_SYNC: |
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ad1ea789fcb..252ae9915de 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h | |||
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args { | |||
193 | struct btrfs_ioctl_space_info spaces[0]; | 193 | struct btrfs_ioctl_space_info spaces[0]; |
194 | }; | 194 | }; |
195 | 195 | ||
196 | struct btrfs_data_container { | ||
197 | __u32 bytes_left; /* out -- bytes not needed to deliver output */ | ||
198 | __u32 bytes_missing; /* out -- additional bytes needed for result */ | ||
199 | __u32 elem_cnt; /* out */ | ||
200 | __u32 elem_missed; /* out */ | ||
201 | __u64 val[0]; /* out */ | ||
202 | }; | ||
203 | |||
204 | struct btrfs_ioctl_ino_path_args { | ||
205 | __u64 inum; /* in */ | ||
206 | __u32 size; /* in */ | ||
207 | __u64 reserved[4]; | ||
208 | /* struct btrfs_data_container *fspath; out */ | ||
209 | __u64 fspath; /* out */ | ||
210 | }; | ||
211 | |||
212 | struct btrfs_ioctl_logical_ino_args { | ||
213 | __u64 logical; /* in */ | ||
214 | __u32 size; /* in */ | ||
215 | __u64 reserved[4]; | ||
216 | /* struct btrfs_data_container *inodes; out */ | ||
217 | __u64 inodes; | ||
218 | }; | ||
219 | |||
196 | #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ | 220 | #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ |
197 | struct btrfs_ioctl_vol_args) | 221 | struct btrfs_ioctl_vol_args) |
198 | #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ | 222 | #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ |
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args { | |||
248 | struct btrfs_ioctl_dev_info_args) | 272 | struct btrfs_ioctl_dev_info_args) |
249 | #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ | 273 | #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ |
250 | struct btrfs_ioctl_fs_info_args) | 274 | struct btrfs_ioctl_fs_info_args) |
275 | #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ | ||
276 | struct btrfs_ioctl_ino_path_args) | ||
277 | #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ | ||
278 | struct btrfs_ioctl_ino_path_args) | ||
279 | |||
251 | #endif | 280 | #endif |
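For context, a minimal userspace sketch of driving the new BTRFS_IOC_LOGICAL_INO ioctl — not part of this patch; the mount point and logical address are placeholders, error handling is elided, and it assumes the header above is visible to userspace:

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
        #include <fcntl.h>
        #include <sys/ioctl.h>
        #include "ioctl.h"              /* the btrfs definitions added above */

        int main(void)
        {
                int fd = open("/mnt/btrfs", O_RDONLY); /* any fd on the fs */
                struct btrfs_data_container *inodes = calloc(1, 4096);
                struct btrfs_ioctl_logical_ino_args loi;
                __u32 i;

                memset(&loi, 0, sizeof(loi));
                loi.logical = 12582912;  /* logical byte address to resolve */
                loi.size = 4096;         /* the kernel caps this at 4096 anyway */
                loi.inodes = (__u64)(unsigned long)inodes;

                if (ioctl(fd, BTRFS_IOC_LOGICAL_INO, &loi) == 0) {
                        /* build_ino_list() packs (inum, offset, root) triples */
                        for (i = 0; i < inodes->elem_cnt; i += 3)
                                printf("inum %llu offset %llu root %llu\n",
                                       (unsigned long long)inodes->val[i],
                                       (unsigned long long)inodes->val[i + 1],
                                       (unsigned long long)inodes->val[i + 2]);
                }
                free(inodes);
                return 0;
        }

BTRFS_IOC_INO_PATHS follows the same pattern, except its container comes back holding path strings whose offsets are the rel_ptr values computed in btrfs_ioctl_ino_to_path().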
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fb2605d998e..f38e452486b 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot) | |||
158 | void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | 158 | void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) |
159 | { | 159 | { |
160 | int i; | 160 | int i; |
161 | u32 type; | 161 | u32 type, nr; |
162 | u32 nr = btrfs_header_nritems(l); | ||
163 | struct btrfs_item *item; | 162 | struct btrfs_item *item; |
164 | struct btrfs_root_item *ri; | 163 | struct btrfs_root_item *ri; |
165 | struct btrfs_dir_item *di; | 164 | struct btrfs_dir_item *di; |
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
172 | struct btrfs_key key; | 171 | struct btrfs_key key; |
173 | struct btrfs_key found_key; | 172 | struct btrfs_key found_key; |
174 | 173 | ||
174 | if (!l) | ||
175 | return; | ||
176 | |||
177 | nr = btrfs_header_nritems(l); | ||
178 | |||
175 | printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", | 179 | printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", |
176 | (unsigned long long)btrfs_header_bytenr(l), nr, | 180 | (unsigned long long)btrfs_header_bytenr(l), nr, |
177 | btrfs_leaf_free_space(root, l)); | 181 | btrfs_leaf_free_space(root, l)); |
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c new file mode 100644 index 00000000000..2373b39a132 --- /dev/null +++ b/fs/btrfs/reada.c | |||
@@ -0,0 +1,951 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 STRATO. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include <linux/pagemap.h> | ||
21 | #include <linux/writeback.h> | ||
22 | #include <linux/blkdev.h> | ||
23 | #include <linux/rbtree.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/workqueue.h> | ||
26 | #include "ctree.h" | ||
27 | #include "volumes.h" | ||
28 | #include "disk-io.h" | ||
29 | #include "transaction.h" | ||
30 | |||
31 | #undef DEBUG | ||
32 | |||
33 | /* | ||
34 | * This is the implementation for the generic read ahead framework. | ||
35 | * | ||
36 | * To trigger a readahead, btrfs_reada_add must be called. It will start | ||
37 | * a read ahead for the given range [start, end) on tree root. The returned | ||
38 | * handle can either be used to wait on the readahead to finish | ||
39 | * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach). | ||
40 | * | ||
41 | * The read ahead works as follows: | ||
42 | * On btrfs_reada_add, the root of the tree is inserted into a radix_tree. | ||
43 | * reada_start_machine will then search for extents to prefetch and trigger | ||
44 | * some reads. When a read finishes for a node, all contained node/leaf | ||
45 | * pointers that lie in the given range will also be enqueued. The reads will | ||
46 | * be triggered in sequential order, thus giving a big win over a naive | ||
47 | * enumeration. It will also make use of multi-device layouts. Each disk | ||
48 | * will have its own read pointer and all disks will be utilized in parallel. | ||
49 | * Also, no two disks will read both sides of a mirror simultaneously, as this | ||
50 | * would waste seeking capacity. Instead both disks will read different parts | ||
51 | * of the filesystem. | ||
52 | * Any number of readaheads can be started in parallel. The read order will be | ||
53 | * determined globally, i.e. 2 parallel readaheads will normally finish faster | ||
54 | * than the same 2 started one after another. | ||
55 | */ | ||
56 | |||
57 | #define MAX_MIRRORS 2 | ||
58 | #define MAX_IN_FLIGHT 6 | ||
59 | |||
60 | struct reada_extctl { | ||
61 | struct list_head list; | ||
62 | struct reada_control *rc; | ||
63 | u64 generation; | ||
64 | }; | ||
65 | |||
66 | struct reada_extent { | ||
67 | u64 logical; | ||
68 | struct btrfs_key top; | ||
69 | u32 blocksize; | ||
70 | int err; | ||
71 | struct list_head extctl; | ||
72 | struct kref refcnt; | ||
73 | spinlock_t lock; | ||
74 | struct reada_zone *zones[MAX_MIRRORS]; | ||
75 | int nzones; | ||
76 | struct btrfs_device *scheduled_for; | ||
77 | }; | ||
78 | |||
79 | struct reada_zone { | ||
80 | u64 start; | ||
81 | u64 end; | ||
82 | u64 elems; | ||
83 | struct list_head list; | ||
84 | spinlock_t lock; | ||
85 | int locked; | ||
86 | struct btrfs_device *device; | ||
87 | struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */ | ||
88 | int ndevs; | ||
89 | struct kref refcnt; | ||
90 | }; | ||
91 | |||
92 | struct reada_machine_work { | ||
93 | struct btrfs_work work; | ||
94 | struct btrfs_fs_info *fs_info; | ||
95 | }; | ||
96 | |||
97 | static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *); | ||
98 | static void reada_control_release(struct kref *kref); | ||
99 | static void reada_zone_release(struct kref *kref); | ||
100 | static void reada_start_machine(struct btrfs_fs_info *fs_info); | ||
101 | static void __reada_start_machine(struct btrfs_fs_info *fs_info); | ||
102 | |||
103 | static int reada_add_block(struct reada_control *rc, u64 logical, | ||
104 | struct btrfs_key *top, int level, u64 generation); | ||
105 | |||
106 | /* recurses */ | ||
107 | /* in case of err, eb might be NULL */ | ||
108 | static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, | ||
109 | u64 start, int err) | ||
110 | { | ||
111 | int level = 0; | ||
112 | int nritems; | ||
113 | int i; | ||
114 | u64 bytenr; | ||
115 | u64 generation; | ||
116 | struct reada_extent *re; | ||
117 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
118 | struct list_head list; | ||
119 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
120 | struct btrfs_device *for_dev; | ||
121 | |||
122 | if (eb) | ||
123 | level = btrfs_header_level(eb); | ||
124 | |||
125 | /* find extent */ | ||
126 | spin_lock(&fs_info->reada_lock); | ||
127 | re = radix_tree_lookup(&fs_info->reada_tree, index); | ||
128 | if (re) | ||
129 | kref_get(&re->refcnt); | ||
130 | spin_unlock(&fs_info->reada_lock); | ||
131 | |||
132 | if (!re) | ||
133 | return -1; | ||
134 | |||
135 | spin_lock(&re->lock); | ||
136 | /* | ||
137 | * just take the full list from the extent. afterwards we | ||
138 | * don't need the lock anymore | ||
139 | */ | ||
140 | list_replace_init(&re->extctl, &list); | ||
141 | for_dev = re->scheduled_for; | ||
142 | re->scheduled_for = NULL; | ||
143 | spin_unlock(&re->lock); | ||
144 | |||
145 | if (err == 0) { | ||
146 | nritems = level ? btrfs_header_nritems(eb) : 0; | ||
147 | generation = btrfs_header_generation(eb); | ||
148 | /* | ||
149 | * FIXME: currently we just set nritems to 0 if this is a leaf, | ||
150 | * effectively ignoring the content. In a next step we could | ||
151 | * trigger more readahead depending on the content, e.g. | ||
152 | * fetch the checksums for the extents in the leaf. | ||
153 | */ | ||
154 | } else { | ||
155 | /* | ||
156 | * this is the error case: the extent buffer has not been | ||
157 | * read correctly. We won't access anything from it and | ||
158 | * just clean up our data structures. Effectively this will | ||
159 | * cut the branch below this node out of the readahead. | ||
160 | */ | ||
161 | nritems = 0; | ||
162 | generation = 0; | ||
163 | } | ||
164 | |||
165 | for (i = 0; i < nritems; i++) { | ||
166 | struct reada_extctl *rec; | ||
167 | u64 n_gen; | ||
168 | struct btrfs_key key; | ||
169 | struct btrfs_key next_key; | ||
170 | |||
171 | btrfs_node_key_to_cpu(eb, &key, i); | ||
172 | if (i + 1 < nritems) | ||
173 | btrfs_node_key_to_cpu(eb, &next_key, i + 1); | ||
174 | else | ||
175 | next_key = re->top; | ||
176 | bytenr = btrfs_node_blockptr(eb, i); | ||
177 | n_gen = btrfs_node_ptr_generation(eb, i); | ||
178 | |||
179 | list_for_each_entry(rec, &list, list) { | ||
180 | struct reada_control *rc = rec->rc; | ||
181 | |||
182 | /* | ||
183 | * if the generation doesn't match, just ignore this | ||
184 | * extctl. This will probably cut off a branch from | ||
185 | * prefetch. Alternatively one could start a new (sub-) | ||
186 | * prefetch for this branch, starting again from root. | ||
187 | * FIXME: move the generation check out of this loop | ||
188 | */ | ||
189 | #ifdef DEBUG | ||
190 | if (rec->generation != generation) { | ||
191 | printk(KERN_DEBUG "generation mismatch for " | ||
192 | "(%llu,%d,%llu) %llu != %llu\n", | ||
193 | key.objectid, key.type, key.offset, | ||
194 | rec->generation, generation); | ||
195 | } | ||
196 | #endif | ||
197 | if (rec->generation == generation && | ||
198 | btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 && | ||
199 | btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0) | ||
200 | reada_add_block(rc, bytenr, &next_key, | ||
201 | level - 1, n_gen); | ||
202 | } | ||
203 | } | ||
204 | /* | ||
205 | * free extctl records | ||
206 | */ | ||
207 | while (!list_empty(&list)) { | ||
208 | struct reada_control *rc; | ||
209 | struct reada_extctl *rec; | ||
210 | |||
211 | rec = list_first_entry(&list, struct reada_extctl, list); | ||
212 | list_del(&rec->list); | ||
213 | rc = rec->rc; | ||
214 | kfree(rec); | ||
215 | |||
216 | kref_get(&rc->refcnt); | ||
217 | if (atomic_dec_and_test(&rc->elems)) { | ||
218 | kref_put(&rc->refcnt, reada_control_release); | ||
219 | wake_up(&rc->wait); | ||
220 | } | ||
221 | kref_put(&rc->refcnt, reada_control_release); | ||
222 | |||
223 | reada_extent_put(fs_info, re); /* one ref for each entry */ | ||
224 | } | ||
225 | reada_extent_put(fs_info, re); /* our ref */ | ||
226 | if (for_dev) | ||
227 | atomic_dec(&for_dev->reada_in_flight); | ||
228 | |||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | /* | ||
233 | * start is passed separately in case eb is NULL, which may be the case with | ||
234 | * failed I/O | ||
235 | */ | ||
236 | int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, | ||
237 | u64 start, int err) | ||
238 | { | ||
239 | int ret; | ||
240 | |||
241 | ret = __readahead_hook(root, eb, start, err); | ||
242 | |||
243 | reada_start_machine(root->fs_info); | ||
244 | |||
245 | return ret; | ||
246 | } | ||
247 | |||
248 | static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, | ||
249 | struct btrfs_device *dev, u64 logical, | ||
250 | struct btrfs_bio *bbio) | ||
251 | { | ||
252 | int ret; | ||
253 | int looped = 0; | ||
254 | struct reada_zone *zone; | ||
255 | struct btrfs_block_group_cache *cache = NULL; | ||
256 | u64 start; | ||
257 | u64 end; | ||
258 | int i; | ||
259 | |||
260 | again: | ||
261 | zone = NULL; | ||
262 | spin_lock(&fs_info->reada_lock); | ||
263 | ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, | ||
264 | logical >> PAGE_CACHE_SHIFT, 1); | ||
265 | if (ret == 1) | ||
266 | kref_get(&zone->refcnt); | ||
267 | spin_unlock(&fs_info->reada_lock); | ||
268 | |||
269 | if (ret == 1) { | ||
270 | if (logical >= zone->start && logical < zone->end) | ||
271 | return zone; | ||
272 | spin_lock(&fs_info->reada_lock); | ||
273 | kref_put(&zone->refcnt, reada_zone_release); | ||
274 | spin_unlock(&fs_info->reada_lock); | ||
275 | } | ||
276 | |||
277 | if (looped) | ||
278 | return NULL; | ||
279 | |||
280 | cache = btrfs_lookup_block_group(fs_info, logical); | ||
281 | if (!cache) | ||
282 | return NULL; | ||
283 | |||
284 | start = cache->key.objectid; | ||
285 | end = start + cache->key.offset - 1; | ||
286 | btrfs_put_block_group(cache); | ||
287 | |||
288 | zone = kzalloc(sizeof(*zone), GFP_NOFS); | ||
289 | if (!zone) | ||
290 | return NULL; | ||
291 | |||
292 | zone->start = start; | ||
293 | zone->end = end; | ||
294 | INIT_LIST_HEAD(&zone->list); | ||
295 | spin_lock_init(&zone->lock); | ||
296 | zone->locked = 0; | ||
297 | kref_init(&zone->refcnt); | ||
298 | zone->elems = 0; | ||
299 | zone->device = dev; /* our device always sits at index 0 */ | ||
300 | for (i = 0; i < bbio->num_stripes; ++i) { | ||
301 | /* bounds have already been checked */ | ||
302 | zone->devs[i] = bbio->stripes[i].dev; | ||
303 | } | ||
304 | zone->ndevs = bbio->num_stripes; | ||
305 | |||
306 | spin_lock(&fs_info->reada_lock); | ||
307 | ret = radix_tree_insert(&dev->reada_zones, | ||
308 | (unsigned long)zone->end >> PAGE_CACHE_SHIFT, | ||
309 | zone); | ||
310 | spin_unlock(&fs_info->reada_lock); | ||
311 | |||
312 | if (ret) { | ||
313 | kfree(zone); | ||
314 | looped = 1; | ||
315 | goto again; | ||
316 | } | ||
317 | |||
318 | return zone; | ||
319 | } | ||
320 | |||
321 | static struct reada_extent *reada_find_extent(struct btrfs_root *root, | ||
322 | u64 logical, | ||
323 | struct btrfs_key *top, int level) | ||
324 | { | ||
325 | int ret; | ||
326 | int looped = 0; | ||
327 | struct reada_extent *re = NULL; | ||
328 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
329 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | ||
330 | struct btrfs_bio *bbio = NULL; | ||
331 | struct btrfs_device *dev; | ||
332 | u32 blocksize; | ||
333 | u64 length; | ||
334 | int nzones = 0; | ||
335 | int i; | ||
336 | unsigned long index = logical >> PAGE_CACHE_SHIFT; | ||
337 | |||
338 | again: | ||
339 | spin_lock(&fs_info->reada_lock); | ||
340 | re = radix_tree_lookup(&fs_info->reada_tree, index); | ||
341 | if (re) | ||
342 | kref_get(&re->refcnt); | ||
343 | spin_unlock(&fs_info->reada_lock); | ||
344 | |||
345 | if (re || looped) | ||
346 | return re; | ||
347 | |||
348 | re = kzalloc(sizeof(*re), GFP_NOFS); | ||
349 | if (!re) | ||
350 | return NULL; | ||
351 | |||
352 | blocksize = btrfs_level_size(root, level); | ||
353 | re->logical = logical; | ||
354 | re->blocksize = blocksize; | ||
355 | re->top = *top; | ||
356 | INIT_LIST_HEAD(&re->extctl); | ||
357 | spin_lock_init(&re->lock); | ||
358 | kref_init(&re->refcnt); | ||
359 | |||
360 | /* | ||
361 | * map block | ||
362 | */ | ||
363 | length = blocksize; | ||
364 | ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0); | ||
365 | if (ret || !bbio || length < blocksize) | ||
366 | goto error; | ||
367 | |||
368 | if (bbio->num_stripes > MAX_MIRRORS) { | ||
369 | printk(KERN_ERR "btrfs readahead: more than %d copies not " | ||
370 | "supported\n", MAX_MIRRORS); | ||
371 | goto error; | ||
372 | } | ||
373 | |||
374 | for (nzones = 0; nzones < bbio->num_stripes; ++nzones) { | ||
375 | struct reada_zone *zone; | ||
376 | |||
377 | dev = bbio->stripes[nzones].dev; | ||
378 | zone = reada_find_zone(fs_info, dev, logical, bbio); | ||
379 | if (!zone) | ||
380 | break; | ||
381 | |||
382 | re->zones[nzones] = zone; | ||
383 | spin_lock(&zone->lock); | ||
384 | if (!zone->elems) | ||
385 | kref_get(&zone->refcnt); | ||
386 | ++zone->elems; | ||
387 | spin_unlock(&zone->lock); | ||
388 | spin_lock(&fs_info->reada_lock); | ||
389 | kref_put(&zone->refcnt, reada_zone_release); | ||
390 | spin_unlock(&fs_info->reada_lock); | ||
391 | } | ||
392 | re->nzones = nzones; | ||
393 | if (nzones == 0) { | ||
394 | /* not a single zone found, error and out */ | ||
395 | goto error; | ||
396 | } | ||
397 | |||
398 | /* insert extent in reada_tree + all per-device trees, all or nothing */ | ||
399 | spin_lock(&fs_info->reada_lock); | ||
400 | ret = radix_tree_insert(&fs_info->reada_tree, index, re); | ||
401 | if (ret) { | ||
402 | spin_unlock(&fs_info->reada_lock); | ||
403 | if (ret != -ENOMEM) { | ||
404 | /* someone inserted the extent in the meantime */ | ||
405 | looped = 1; | ||
406 | } | ||
407 | goto error; | ||
408 | } | ||
409 | for (i = 0; i < nzones; ++i) { | ||
410 | dev = bbio->stripes[i].dev; | ||
411 | ret = radix_tree_insert(&dev->reada_extents, index, re); | ||
412 | if (ret) { | ||
413 | while (--i >= 0) { | ||
414 | dev = bbio->stripes[i].dev; | ||
415 | BUG_ON(dev == NULL); | ||
416 | radix_tree_delete(&dev->reada_extents, index); | ||
417 | } | ||
418 | BUG_ON(fs_info == NULL); | ||
419 | radix_tree_delete(&fs_info->reada_tree, index); | ||
420 | spin_unlock(&fs_info->reada_lock); | ||
421 | goto error; | ||
422 | } | ||
423 | } | ||
424 | spin_unlock(&fs_info->reada_lock); | ||
425 | |||
426 | kfree(bbio); | ||
427 | return re; | ||
428 | |||
429 | error: | ||
430 | while (nzones) { | ||
431 | struct reada_zone *zone; | ||
432 | |||
433 | --nzones; | ||
434 | zone = re->zones[nzones]; | ||
435 | kref_get(&zone->refcnt); | ||
436 | spin_lock(&zone->lock); | ||
437 | --zone->elems; | ||
438 | if (zone->elems == 0) { | ||
439 | /* | ||
440 | * no fs_info->reada_lock needed, as this can't be | ||
441 | * the last ref | ||
442 | */ | ||
443 | kref_put(&zone->refcnt, reada_zone_release); | ||
444 | } | ||
445 | spin_unlock(&zone->lock); | ||
446 | |||
447 | spin_lock(&fs_info->reada_lock); | ||
448 | kref_put(&zone->refcnt, reada_zone_release); | ||
449 | spin_unlock(&fs_info->reada_lock); | ||
450 | } | ||
451 | kfree(bbio); | ||
452 | kfree(re); | ||
453 | if (looped) | ||
454 | goto again; | ||
455 | return NULL; | ||
456 | } | ||
457 | |||
458 | static void reada_kref_dummy(struct kref *kr) | ||
459 | { | ||
460 | } | ||
461 | |||
462 | static void reada_extent_put(struct btrfs_fs_info *fs_info, | ||
463 | struct reada_extent *re) | ||
464 | { | ||
465 | int i; | ||
466 | unsigned long index = re->logical >> PAGE_CACHE_SHIFT; | ||
467 | |||
468 | spin_lock(&fs_info->reada_lock); | ||
469 | if (!kref_put(&re->refcnt, reada_kref_dummy)) { | ||
470 | spin_unlock(&fs_info->reada_lock); | ||
471 | return; | ||
472 | } | ||
473 | |||
474 | radix_tree_delete(&fs_info->reada_tree, index); | ||
475 | for (i = 0; i < re->nzones; ++i) { | ||
476 | struct reada_zone *zone = re->zones[i]; | ||
477 | |||
478 | radix_tree_delete(&zone->device->reada_extents, index); | ||
479 | } | ||
480 | |||
481 | spin_unlock(&fs_info->reada_lock); | ||
482 | |||
483 | for (i = 0; i < re->nzones; ++i) { | ||
484 | struct reada_zone *zone = re->zones[i]; | ||
485 | |||
486 | kref_get(&zone->refcnt); | ||
487 | spin_lock(&zone->lock); | ||
488 | --zone->elems; | ||
489 | if (zone->elems == 0) { | ||
490 | /* no fs_info->reada_lock needed, as this can't be | ||
491 | * the last ref */ | ||
492 | kref_put(&zone->refcnt, reada_zone_release); | ||
493 | } | ||
494 | spin_unlock(&zone->lock); | ||
495 | |||
496 | spin_lock(&fs_info->reada_lock); | ||
497 | kref_put(&zone->refcnt, reada_zone_release); | ||
498 | spin_unlock(&fs_info->reada_lock); | ||
499 | } | ||
500 | if (re->scheduled_for) | ||
501 | atomic_dec(&re->scheduled_for->reada_in_flight); | ||
502 | |||
503 | kfree(re); | ||
504 | } | ||
505 | |||
506 | static void reada_zone_release(struct kref *kref) | ||
507 | { | ||
508 | struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); | ||
509 | |||
510 | radix_tree_delete(&zone->device->reada_zones, | ||
511 | zone->end >> PAGE_CACHE_SHIFT); | ||
512 | |||
513 | kfree(zone); | ||
514 | } | ||
515 | |||
516 | static void reada_control_release(struct kref *kref) | ||
517 | { | ||
518 | struct reada_control *rc = container_of(kref, struct reada_control, | ||
519 | refcnt); | ||
520 | |||
521 | kfree(rc); | ||
522 | } | ||
523 | |||
524 | static int reada_add_block(struct reada_control *rc, u64 logical, | ||
525 | struct btrfs_key *top, int level, u64 generation) | ||
526 | { | ||
527 | struct btrfs_root *root = rc->root; | ||
528 | struct reada_extent *re; | ||
529 | struct reada_extctl *rec; | ||
530 | |||
531 | re = reada_find_extent(root, logical, top, level); /* takes one ref */ | ||
532 | if (!re) | ||
533 | return -1; | ||
534 | |||
535 | rec = kzalloc(sizeof(*rec), GFP_NOFS); | ||
536 | if (!rec) { | ||
537 | reada_extent_put(root->fs_info, re); | ||
538 | return -1; | ||
539 | } | ||
540 | |||
541 | rec->rc = rc; | ||
542 | rec->generation = generation; | ||
543 | atomic_inc(&rc->elems); | ||
544 | |||
545 | spin_lock(&re->lock); | ||
546 | list_add_tail(&rec->list, &re->extctl); | ||
547 | spin_unlock(&re->lock); | ||
548 | |||
549 | /* leave the ref on the extent */ | ||
550 | |||
551 | return 0; | ||
552 | } | ||
553 | |||
554 | /* | ||
555 | * called with fs_info->reada_lock held | ||
556 | */ | ||
557 | static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) | ||
558 | { | ||
559 | int i; | ||
560 | unsigned long index = zone->end >> PAGE_CACHE_SHIFT; | ||
561 | |||
562 | for (i = 0; i < zone->ndevs; ++i) { | ||
563 | struct reada_zone *peer; | ||
564 | peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index); | ||
565 | if (peer && peer->device != zone->device) | ||
566 | peer->locked = lock; | ||
567 | } | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * called with fs_info->reada_lock held | ||
572 | */ | ||
573 | static int reada_pick_zone(struct btrfs_device *dev) | ||
574 | { | ||
575 | struct reada_zone *top_zone = NULL; | ||
576 | struct reada_zone *top_locked_zone = NULL; | ||
577 | u64 top_elems = 0; | ||
578 | u64 top_locked_elems = 0; | ||
579 | unsigned long index = 0; | ||
580 | int ret; | ||
581 | |||
582 | if (dev->reada_curr_zone) { | ||
583 | reada_peer_zones_set_lock(dev->reada_curr_zone, 0); | ||
584 | kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release); | ||
585 | dev->reada_curr_zone = NULL; | ||
586 | } | ||
587 | /* pick the zone with the most elements */ | ||
588 | while (1) { | ||
589 | struct reada_zone *zone; | ||
590 | |||
591 | ret = radix_tree_gang_lookup(&dev->reada_zones, | ||
592 | (void **)&zone, index, 1); | ||
593 | if (ret == 0) | ||
594 | break; | ||
595 | index = (zone->end >> PAGE_CACHE_SHIFT) + 1; | ||
596 | if (zone->locked) { | ||
597 | if (zone->elems > top_locked_elems) { | ||
598 | top_locked_elems = zone->elems; | ||
599 | top_locked_zone = zone; | ||
600 | } | ||
601 | } else { | ||
602 | if (zone->elems > top_elems) { | ||
603 | top_elems = zone->elems; | ||
604 | top_zone = zone; | ||
605 | } | ||
606 | } | ||
607 | } | ||
608 | if (top_zone) | ||
609 | dev->reada_curr_zone = top_zone; | ||
610 | else if (top_locked_zone) | ||
611 | dev->reada_curr_zone = top_locked_zone; | ||
612 | else | ||
613 | return 0; | ||
614 | |||
615 | dev->reada_next = dev->reada_curr_zone->start; | ||
616 | kref_get(&dev->reada_curr_zone->refcnt); | ||
617 | reada_peer_zones_set_lock(dev->reada_curr_zone, 1); | ||
618 | |||
619 | return 1; | ||
620 | } | ||
621 | |||
622 | static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, | ||
623 | struct btrfs_device *dev) | ||
624 | { | ||
625 | struct reada_extent *re = NULL; | ||
626 | int mirror_num = 0; | ||
627 | struct extent_buffer *eb = NULL; | ||
628 | u64 logical; | ||
629 | u32 blocksize; | ||
630 | int ret; | ||
631 | int i; | ||
632 | int need_kick = 0; | ||
633 | |||
634 | spin_lock(&fs_info->reada_lock); | ||
635 | if (dev->reada_curr_zone == NULL) { | ||
636 | ret = reada_pick_zone(dev); | ||
637 | if (!ret) { | ||
638 | spin_unlock(&fs_info->reada_lock); | ||
639 | return 0; | ||
640 | } | ||
641 | } | ||
642 | /* | ||
643 | * FIXME currently we issue the reads one extent at a time. If we have | ||
644 | * a contiguous block of extents, we could also coalesce them or use | ||
645 | * plugging to speed things up | ||
646 | */ | ||
647 | ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, | ||
648 | dev->reada_next >> PAGE_CACHE_SHIFT, 1); | ||
649 | if (ret == 0 || re->logical >= dev->reada_curr_zone->end) { | ||
650 | ret = reada_pick_zone(dev); | ||
651 | if (!ret) { | ||
652 | spin_unlock(&fs_info->reada_lock); | ||
653 | return 0; | ||
654 | } | ||
655 | re = NULL; | ||
656 | ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, | ||
657 | dev->reada_next >> PAGE_CACHE_SHIFT, 1); | ||
658 | } | ||
659 | if (ret == 0) { | ||
660 | spin_unlock(&fs_info->reada_lock); | ||
661 | return 0; | ||
662 | } | ||
663 | dev->reada_next = re->logical + re->blocksize; | ||
664 | kref_get(&re->refcnt); | ||
665 | |||
666 | spin_unlock(&fs_info->reada_lock); | ||
667 | |||
668 | /* | ||
669 | * find mirror num | ||
670 | */ | ||
671 | for (i = 0; i < re->nzones; ++i) { | ||
672 | if (re->zones[i]->device == dev) { | ||
673 | mirror_num = i + 1; | ||
674 | break; | ||
675 | } | ||
676 | } | ||
677 | logical = re->logical; | ||
678 | blocksize = re->blocksize; | ||
679 | |||
680 | spin_lock(&re->lock); | ||
681 | if (re->scheduled_for == NULL) { | ||
682 | re->scheduled_for = dev; | ||
683 | need_kick = 1; | ||
684 | } | ||
685 | spin_unlock(&re->lock); | ||
686 | |||
687 | reada_extent_put(fs_info, re); | ||
688 | |||
689 | if (!need_kick) | ||
690 | return 0; | ||
691 | |||
692 | atomic_inc(&dev->reada_in_flight); | ||
693 | ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, | ||
694 | mirror_num, &eb); | ||
695 | if (ret) | ||
696 | __readahead_hook(fs_info->extent_root, NULL, logical, ret); | ||
697 | else if (eb) | ||
698 | __readahead_hook(fs_info->extent_root, eb, eb->start, ret); | ||
699 | |||
700 | if (eb) | ||
701 | free_extent_buffer(eb); | ||
702 | |||
703 | return 1; | ||
704 | |||
705 | } | ||
706 | |||
707 | static void reada_start_machine_worker(struct btrfs_work *work) | ||
708 | { | ||
709 | struct reada_machine_work *rmw; | ||
710 | struct btrfs_fs_info *fs_info; | ||
711 | |||
712 | rmw = container_of(work, struct reada_machine_work, work); | ||
713 | fs_info = rmw->fs_info; | ||
714 | |||
715 | kfree(rmw); | ||
716 | |||
717 | __reada_start_machine(fs_info); | ||
718 | } | ||
719 | |||
720 | static void __reada_start_machine(struct btrfs_fs_info *fs_info) | ||
721 | { | ||
722 | struct btrfs_device *device; | ||
723 | struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; | ||
724 | u64 enqueued; | ||
725 | u64 total = 0; | ||
726 | int i; | ||
727 | |||
728 | do { | ||
729 | enqueued = 0; | ||
730 | list_for_each_entry(device, &fs_devices->devices, dev_list) { | ||
731 | if (atomic_read(&device->reada_in_flight) < | ||
732 | MAX_IN_FLIGHT) | ||
733 | enqueued += reada_start_machine_dev(fs_info, | ||
734 | device); | ||
735 | } | ||
736 | total += enqueued; | ||
737 | } while (enqueued && total < 10000); | ||
738 | |||
739 | if (enqueued == 0) | ||
740 | return; | ||
741 | |||
742 | /* | ||
743 | * If everything is already in the cache, this is effectively single | ||
744 | * threaded. To a) not hold the caller for too long and b) to utilize | ||
745 | * more cores, we break the loop above once 10000 reads have been | ||
746 | * enqueued and hand the rest off to the workers to finish. This will | ||
747 | * distribute the load across the cores. | ||
748 | */ | ||
749 | for (i = 0; i < 2; ++i) | ||
750 | reada_start_machine(fs_info); | ||
751 | } | ||
752 | |||
753 | static void reada_start_machine(struct btrfs_fs_info *fs_info) | ||
754 | { | ||
755 | struct reada_machine_work *rmw; | ||
756 | |||
757 | rmw = kzalloc(sizeof(*rmw), GFP_NOFS); | ||
758 | if (!rmw) { | ||
759 | /* FIXME we cannot handle this properly right now */ | ||
760 | BUG(); | ||
761 | } | ||
762 | rmw->work.func = reada_start_machine_worker; | ||
763 | rmw->fs_info = fs_info; | ||
764 | |||
765 | btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); | ||
766 | } | ||
767 | |||
768 | #ifdef DEBUG | ||
769 | static void dump_devs(struct btrfs_fs_info *fs_info, int all) | ||
770 | { | ||
771 | struct btrfs_device *device; | ||
772 | struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; | ||
773 | unsigned long index; | ||
774 | int ret; | ||
775 | int i; | ||
776 | int j; | ||
777 | int cnt; | ||
778 | |||
779 | spin_lock(&fs_info->reada_lock); | ||
780 | list_for_each_entry(device, &fs_devices->devices, dev_list) { | ||
781 | printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid, | ||
782 | atomic_read(&device->reada_in_flight)); | ||
783 | index = 0; | ||
784 | while (1) { | ||
785 | struct reada_zone *zone; | ||
786 | ret = radix_tree_gang_lookup(&device->reada_zones, | ||
787 | (void **)&zone, index, 1); | ||
788 | if (ret == 0) | ||
789 | break; | ||
790 | printk(KERN_DEBUG " zone %llu-%llu elems %llu locked " | ||
791 | "%d devs", zone->start, zone->end, zone->elems, | ||
792 | zone->locked); | ||
793 | for (j = 0; j < zone->ndevs; ++j) { | ||
794 | printk(KERN_CONT " %lld", | ||
795 | zone->devs[j]->devid); | ||
796 | } | ||
797 | if (device->reada_curr_zone == zone) | ||
798 | printk(KERN_CONT " curr off %llu", | ||
799 | device->reada_next - zone->start); | ||
800 | printk(KERN_CONT "\n"); | ||
801 | index = (zone->end >> PAGE_CACHE_SHIFT) + 1; | ||
802 | } | ||
803 | cnt = 0; | ||
804 | index = 0; | ||
805 | while (all) { | ||
806 | struct reada_extent *re = NULL; | ||
807 | |||
808 | ret = radix_tree_gang_lookup(&device->reada_extents, | ||
809 | (void **)&re, index, 1); | ||
810 | if (ret == 0) | ||
811 | break; | ||
812 | printk(KERN_DEBUG | ||
813 | " re: logical %llu size %u empty %d for %lld", | ||
814 | re->logical, re->blocksize, | ||
815 | list_empty(&re->extctl), re->scheduled_for ? | ||
816 | re->scheduled_for->devid : -1); | ||
817 | |||
818 | for (i = 0; i < re->nzones; ++i) { | ||
819 | printk(KERN_CONT " zone %llu-%llu devs", | ||
820 | re->zones[i]->start, | ||
821 | re->zones[i]->end); | ||
822 | for (j = 0; j < re->zones[i]->ndevs; ++j) { | ||
823 | printk(KERN_CONT " %lld", | ||
824 | re->zones[i]->devs[j]->devid); | ||
825 | } | ||
826 | } | ||
827 | printk(KERN_CONT "\n"); | ||
828 | index = (re->logical >> PAGE_CACHE_SHIFT) + 1; | ||
829 | if (++cnt > 15) | ||
830 | break; | ||
831 | } | ||
832 | } | ||
833 | |||
834 | index = 0; | ||
835 | cnt = 0; | ||
836 | while (all) { | ||
837 | struct reada_extent *re = NULL; | ||
838 | |||
839 | ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re, | ||
840 | index, 1); | ||
841 | if (ret == 0) | ||
842 | break; | ||
843 | if (!re->scheduled_for) { | ||
844 | index = (re->logical >> PAGE_CACHE_SHIFT) + 1; | ||
845 | continue; | ||
846 | } | ||
847 | printk(KERN_DEBUG | ||
848 | "re: logical %llu size %u list empty %d for %lld", | ||
849 | re->logical, re->blocksize, list_empty(&re->extctl), | ||
850 | re->scheduled_for ? re->scheduled_for->devid : -1); | ||
851 | for (i = 0; i < re->nzones; ++i) { | ||
852 | printk(KERN_CONT " zone %llu-%llu devs", | ||
853 | re->zones[i]->start, | ||
854 | re->zones[i]->end); | ||
855 | for (j = 0; j < re->zones[i]->ndevs; ++j) { | ||
856 | printk(KERN_CONT " %lld", | ||
857 | re->zones[i]->devs[j]->devid); | ||
858 | } | ||
859 | } | ||
865 | printk(KERN_CONT "\n"); | ||
866 | index = (re->logical >> PAGE_CACHE_SHIFT) + 1; | ||
867 | } | ||
868 | spin_unlock(&fs_info->reada_lock); | ||
869 | } | ||
870 | #endif | ||
871 | |||
872 | /* | ||
873 | * interface | ||
874 | */ | ||
875 | struct reada_control *btrfs_reada_add(struct btrfs_root *root, | ||
876 | struct btrfs_key *key_start, struct btrfs_key *key_end) | ||
877 | { | ||
878 | struct reada_control *rc; | ||
879 | u64 start; | ||
880 | u64 generation; | ||
881 | int level; | ||
882 | struct extent_buffer *node; | ||
883 | static struct btrfs_key max_key = { | ||
884 | .objectid = (u64)-1, | ||
885 | .type = (u8)-1, | ||
886 | .offset = (u64)-1 | ||
887 | }; | ||
888 | |||
889 | rc = kzalloc(sizeof(*rc), GFP_NOFS); | ||
890 | if (!rc) | ||
891 | return ERR_PTR(-ENOMEM); | ||
892 | |||
893 | rc->root = root; | ||
894 | rc->key_start = *key_start; | ||
895 | rc->key_end = *key_end; | ||
896 | atomic_set(&rc->elems, 0); | ||
897 | init_waitqueue_head(&rc->wait); | ||
898 | kref_init(&rc->refcnt); | ||
899 | kref_get(&rc->refcnt); /* one ref for having elements */ | ||
900 | |||
901 | node = btrfs_root_node(root); | ||
902 | start = node->start; | ||
903 | level = btrfs_header_level(node); | ||
904 | generation = btrfs_header_generation(node); | ||
905 | free_extent_buffer(node); | ||
906 | |||
907 | reada_add_block(rc, start, &max_key, level, generation); | ||
908 | |||
909 | reada_start_machine(root->fs_info); | ||
910 | |||
911 | return rc; | ||
912 | } | ||
913 | |||
914 | #ifdef DEBUG | ||
915 | int btrfs_reada_wait(void *handle) | ||
916 | { | ||
917 | struct reada_control *rc = handle; | ||
918 | |||
919 | while (atomic_read(&rc->elems)) { | ||
920 | wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, | ||
921 | 5 * HZ); | ||
922 | dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); | ||
923 | } | ||
924 | |||
925 | dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); | ||
926 | |||
927 | kref_put(&rc->refcnt, reada_control_release); | ||
928 | |||
929 | return 0; | ||
930 | } | ||
931 | #else | ||
932 | int btrfs_reada_wait(void *handle) | ||
933 | { | ||
934 | struct reada_control *rc = handle; | ||
935 | |||
936 | while (atomic_read(&rc->elems)) { | ||
937 | wait_event(rc->wait, atomic_read(&rc->elems) == 0); | ||
938 | } | ||
939 | |||
940 | kref_put(&rc->refcnt, reada_control_release); | ||
941 | |||
942 | return 0; | ||
943 | } | ||
944 | #endif | ||
945 | |||
946 | void btrfs_reada_detach(void *handle) | ||
947 | { | ||
948 | struct reada_control *rc = handle; | ||
949 | |||
950 | kref_put(&rc->refcnt, reada_control_release); | ||
951 | } | ||
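
To make the interface documented at the top of reada.c concrete, here is a hypothetical in-kernel caller — a sketch assuming it lives in fs/btrfs with "ctree.h" available. Only btrfs_reada_add, btrfs_reada_wait and btrfs_reada_detach are real; the surrounding function name is made up.

/* prefetch an entire tree, from the smallest to the largest possible key */
static void prefetch_whole_tree(struct btrfs_root *root)
{
	struct reada_control *rc;
	struct btrfs_key key_start;
	struct btrfs_key key_end;

	key_start.objectid = 0;
	key_start.type = 0;
	key_start.offset = 0;
	key_end.objectid = (u64)-1;
	key_end.type = (u8)-1;
	key_end.offset = (u64)-1;

	rc = btrfs_reada_add(root, &key_start, &key_end);
	if (IS_ERR(rc))
		return;

	/* block until all enqueued reads have completed ... */
	btrfs_reada_wait(rc);

	/*
	 * ... or call btrfs_reada_detach(rc) instead to drop the
	 * reference and let the readahead finish in the background.
	 */
}

One design note worth spelling out: reada_find_zone keys each zone in the per-device radix tree by zone->end >> PAGE_CACHE_SHIFT, so a gang lookup of a single element starting at any logical address inside a zone returns the covering zone — its end is the first key at or after that address. Extents, by contrast, are keyed by their start page, which is why the lookup in reada_start_machine_dev finds the next extent at or after dev->reada_next.
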
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 59bb1764273..dff29d5e151 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, | |||
1174 | list_add_tail(&new_edge->list[UPPER], | 1174 | list_add_tail(&new_edge->list[UPPER], |
1175 | &new_node->lower); | 1175 | &new_node->lower); |
1176 | } | 1176 | } |
1177 | } else { | ||
1178 | list_add_tail(&new_node->lower, &cache->leaves); | ||
1177 | } | 1179 | } |
1178 | 1180 | ||
1179 | rb_node = tree_insert(&cache->rb_root, new_node->bytenr, | 1181 | rb_node = tree_insert(&cache->rb_root, new_node->bytenr, |
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, | |||
2041 | BUG_ON(IS_ERR(trans)); | 2043 | BUG_ON(IS_ERR(trans)); |
2042 | trans->block_rsv = rc->block_rsv; | 2044 | trans->block_rsv = rc->block_rsv; |
2043 | 2045 | ||
2044 | ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, | 2046 | ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved); |
2045 | min_reserved, 0); | ||
2046 | if (ret) { | 2047 | if (ret) { |
2047 | BUG_ON(ret != -EAGAIN); | 2048 | BUG_ON(ret != -EAGAIN); |
2048 | ret = btrfs_commit_transaction(trans, root); | 2049 | ret = btrfs_commit_transaction(trans, root); |
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err) | |||
2152 | again: | 2153 | again: |
2153 | if (!err) { | 2154 | if (!err) { |
2154 | num_bytes = rc->merging_rsv_size; | 2155 | num_bytes = rc->merging_rsv_size; |
2155 | ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, | 2156 | ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); |
2156 | num_bytes); | ||
2157 | if (ret) | 2157 | if (ret) |
2158 | err = ret; | 2158 | err = ret; |
2159 | } | 2159 | } |
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, | |||
2427 | num_bytes = calcu_metadata_size(rc, node, 1) * 2; | 2427 | num_bytes = calcu_metadata_size(rc, node, 1) * 2; |
2428 | 2428 | ||
2429 | trans->block_rsv = rc->block_rsv; | 2429 | trans->block_rsv = rc->block_rsv; |
2430 | ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); | 2430 | ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes); |
2431 | if (ret) { | 2431 | if (ret) { |
2432 | if (ret == -EAGAIN) | 2432 | if (ret == -EAGAIN) |
2433 | rc->commit_transaction = 1; | 2433 | rc->commit_transaction = 1; |
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
2922 | unsigned long last_index; | 2922 | unsigned long last_index; |
2923 | struct page *page; | 2923 | struct page *page; |
2924 | struct file_ra_state *ra; | 2924 | struct file_ra_state *ra; |
2925 | gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); | ||
2925 | int nr = 0; | 2926 | int nr = 0; |
2926 | int ret = 0; | 2927 | int ret = 0; |
2927 | 2928 | ||
@@ -2956,7 +2957,7 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
2956 | ra, NULL, index, | 2957 | ra, NULL, index, |
2957 | last_index + 1 - index); | 2958 | last_index + 1 - index); |
2958 | page = find_or_create_page(inode->i_mapping, index, | 2959 | page = find_or_create_page(inode->i_mapping, index, |
2959 | GFP_NOFS); | 2960 | mask); |
2960 | if (!page) { | 2961 | if (!page) { |
2961 | btrfs_delalloc_release_metadata(inode, | 2962 | btrfs_delalloc_release_metadata(inode, |
2962 | PAGE_CACHE_SIZE); | 2963 | PAGE_CACHE_SIZE); |
@@ -3323,8 +3324,11 @@ static int find_data_references(struct reloc_control *rc, | |||
3323 | } | 3324 | } |
3324 | 3325 | ||
3325 | key.objectid = ref_objectid; | 3326 | key.objectid = ref_objectid; |
3326 | key.offset = ref_offset; | ||
3327 | key.type = BTRFS_EXTENT_DATA_KEY; | 3327 | key.type = BTRFS_EXTENT_DATA_KEY; |
3328 | if (ref_offset > ((u64)-1 << 32)) | ||
3329 | key.offset = 0; | ||
3330 | else | ||
3331 | key.offset = ref_offset; | ||
3328 | 3332 | ||
3329 | path->search_commit_root = 1; | 3333 | path->search_commit_root = 1; |
3330 | path->skip_locking = 1; | 3334 | path->skip_locking = 1; |
@@ -3645,14 +3649,11 @@ int prepare_to_relocate(struct reloc_control *rc) | |||
3645 | * btrfs_init_reloc_root will use them when there | 3649 | * btrfs_init_reloc_root will use them when there |
3646 | * is no reservation in transaction handle. | 3650 | * is no reservation in transaction handle. |
3647 | */ | 3651 | */ |
3648 | ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, | 3652 | ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv, |
3649 | rc->extent_root->nodesize * 256); | 3653 | rc->extent_root->nodesize * 256); |
3650 | if (ret) | 3654 | if (ret) |
3651 | return ret; | 3655 | return ret; |
3652 | 3656 | ||
3653 | rc->block_rsv->refill_used = 1; | ||
3654 | btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); | ||
3655 | |||
3656 | memset(&rc->cluster, 0, sizeof(rc->cluster)); | 3657 | memset(&rc->cluster, 0, sizeof(rc->cluster)); |
3657 | rc->search_start = rc->block_group->key.objectid; | 3658 | rc->search_start = rc->block_group->key.objectid; |
3658 | rc->extents_found = 0; | 3659 | rc->extents_found = 0; |
@@ -3777,8 +3778,7 @@ restart: | |||
3777 | } | 3778 | } |
3778 | } | 3779 | } |
3779 | 3780 | ||
3780 | ret = btrfs_block_rsv_check(trans, rc->extent_root, | 3781 | ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); |
3781 | rc->block_rsv, 0, 5); | ||
3782 | if (ret < 0) { | 3782 | if (ret < 0) { |
3783 | if (ret != -EAGAIN) { | 3783 | if (ret != -EAGAIN) { |
3784 | err = ret; | 3784 | err = ret; |
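
All of the reservation changes in this file follow one pattern: the transaction-handle argument is gone and the helpers operate on a root plus a block_rsv. For reference, the new calling convention as inferred purely from the call sites above (a sketch, not authoritative prototypes):

/* reserve num_bytes into the rsv; may return -EAGAIN under pressure */
int btrfs_block_rsv_add(struct btrfs_root *root,
			struct btrfs_block_rsv *block_rsv, u64 num_bytes);

/* top the rsv back up to at least min_reserved bytes */
int btrfs_block_rsv_refill(struct btrfs_root *root,
			   struct btrfs_block_rsv *block_rsv, u64 min_reserved);

/* check the rsv fill level; the 5 above reads as a minimum fill factor */
int btrfs_block_rsv_check(struct btrfs_root *root,
			  struct btrfs_block_rsv *block_rsv, int min_factor);

The callers above react to -EAGAIN by committing the running transaction and retrying, as merge_reloc_root does.
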
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5..c27bcb67f33 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -17,10 +17,14 @@ | |||
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/blkdev.h> | 19 | #include <linux/blkdev.h> |
20 | #include <linux/ratelimit.h> | ||
20 | #include "ctree.h" | 21 | #include "ctree.h" |
21 | #include "volumes.h" | 22 | #include "volumes.h" |
22 | #include "disk-io.h" | 23 | #include "disk-io.h" |
23 | #include "ordered-data.h" | 24 | #include "ordered-data.h" |
25 | #include "transaction.h" | ||
26 | #include "backref.h" | ||
27 | #include "extent_io.h" | ||
24 | 28 | ||
25 | /* | 29 | /* |
26 | * This is only the first step towards a full-featured scrub. It reads all | 30 |
@@ -29,15 +33,12 @@ | |||
29 | * any can be found. | 33 | * any can be found. |
30 | * | 34 | * |
31 | * Future enhancements: | 35 | * Future enhancements: |
32 | * - To enhance the performance, better read-ahead strategies for the | ||
33 | * extent-tree can be employed. | ||
34 | * - In case an unrepairable extent is encountered, track which files are | 36 | * - In case an unrepairable extent is encountered, track which files are |
35 | * affected and report them | 37 | * affected and report them |
36 | * - In case of a read error on files with nodatasum, map the file and read | 38 | * - In case of a read error on files with nodatasum, map the file and read |
37 | * the extent to trigger a writeback of the good copy | 39 | * the extent to trigger a writeback of the good copy |
38 | * - track and record media errors, throw out bad devices | 40 | * - track and record media errors, throw out bad devices |
39 | * - add a mode to also read unallocated space | 41 | * - add a mode to also read unallocated space |
40 | * - make the prefetch cancellable | ||
41 | */ | 42 | */ |
42 | 43 | ||
43 | struct scrub_bio; | 44 | struct scrub_bio; |
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix); | |||
63 | struct scrub_page { | 64 | struct scrub_page { |
64 | u64 flags; /* extent flags */ | 65 | u64 flags; /* extent flags */ |
65 | u64 generation; | 66 | u64 generation; |
66 | u64 mirror_num; | 67 | int mirror_num; |
67 | int have_csum; | 68 | int have_csum; |
68 | u8 csum[BTRFS_CSUM_SIZE]; | 69 | u8 csum[BTRFS_CSUM_SIZE]; |
69 | }; | 70 | }; |
@@ -87,6 +88,7 @@ struct scrub_dev { | |||
87 | int first_free; | 88 | int first_free; |
88 | int curr; | 89 | int curr; |
89 | atomic_t in_flight; | 90 | atomic_t in_flight; |
91 | atomic_t fixup_cnt; | ||
90 | spinlock_t list_lock; | 92 | spinlock_t list_lock; |
91 | wait_queue_head_t list_wait; | 93 | wait_queue_head_t list_wait; |
92 | u16 csum_size; | 94 | u16 csum_size; |
@@ -100,6 +102,27 @@ struct scrub_dev { | |||
100 | spinlock_t stat_lock; | 102 | spinlock_t stat_lock; |
101 | }; | 103 | }; |
102 | 104 | ||
105 | struct scrub_fixup_nodatasum { | ||
106 | struct scrub_dev *sdev; | ||
107 | u64 logical; | ||
108 | struct btrfs_root *root; | ||
109 | struct btrfs_work work; | ||
110 | int mirror_num; | ||
111 | }; | ||
112 | |||
113 | struct scrub_warning { | ||
114 | struct btrfs_path *path; | ||
115 | u64 extent_item_size; | ||
116 | char *scratch_buf; | ||
117 | char *msg_buf; | ||
118 | const char *errstr; | ||
119 | sector_t sector; | ||
120 | u64 logical; | ||
121 | struct btrfs_device *dev; | ||
122 | int msg_bufsize; | ||
123 | int scratch_bufsize; | ||
124 | }; | ||
125 | |||
103 | static void scrub_free_csums(struct scrub_dev *sdev) | 126 | static void scrub_free_csums(struct scrub_dev *sdev) |
104 | { | 127 | { |
105 | while (!list_empty(&sdev->csum_list)) { | 128 | while (!list_empty(&sdev->csum_list)) { |
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) | |||
175 | 198 | ||
176 | if (i != SCRUB_BIOS_PER_DEV-1) | 199 | if (i != SCRUB_BIOS_PER_DEV-1) |
177 | sdev->bios[i]->next_free = i + 1; | 200 | sdev->bios[i]->next_free = i + 1; |
178 | else | 201 | else |
179 | sdev->bios[i]->next_free = -1; | 202 | sdev->bios[i]->next_free = -1; |
180 | } | 203 | } |
181 | sdev->first_free = 0; | 204 | sdev->first_free = 0; |
182 | sdev->curr = -1; | 205 | sdev->curr = -1; |
183 | atomic_set(&sdev->in_flight, 0); | 206 | atomic_set(&sdev->in_flight, 0); |
207 | atomic_set(&sdev->fixup_cnt, 0); | ||
184 | atomic_set(&sdev->cancel_req, 0); | 208 | atomic_set(&sdev->cancel_req, 0); |
185 | sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); | 209 | sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); |
186 | INIT_LIST_HEAD(&sdev->csum_list); | 210 | INIT_LIST_HEAD(&sdev->csum_list); |
187 | 211 | ||
188 | spin_lock_init(&sdev->list_lock); | 212 | spin_lock_init(&sdev->list_lock); |
@@ -195,24 +219,366 @@ nomem: | |||
195 | return ERR_PTR(-ENOMEM); | 219 | return ERR_PTR(-ENOMEM); |
196 | } | 220 | } |
197 | 221 | ||
222 | static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) | ||
223 | { | ||
224 | u64 isize; | ||
225 | u32 nlink; | ||
226 | int ret; | ||
227 | int i; | ||
228 | struct extent_buffer *eb; | ||
229 | struct btrfs_inode_item *inode_item; | ||
230 | struct scrub_warning *swarn = ctx; | ||
231 | struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; | ||
232 | struct inode_fs_paths *ipath = NULL; | ||
233 | struct btrfs_root *local_root; | ||
234 | struct btrfs_key root_key; | ||
235 | |||
236 | root_key.objectid = root; | ||
237 | root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
238 | root_key.offset = (u64)-1; | ||
239 | local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); | ||
240 | if (IS_ERR(local_root)) { | ||
241 | ret = PTR_ERR(local_root); | ||
242 | goto err; | ||
243 | } | ||
244 | |||
245 | ret = inode_item_info(inum, 0, local_root, swarn->path); | ||
246 | if (ret) { | ||
247 | btrfs_release_path(swarn->path); | ||
248 | goto err; | ||
249 | } | ||
250 | |||
251 | eb = swarn->path->nodes[0]; | ||
252 | inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], | ||
253 | struct btrfs_inode_item); | ||
254 | isize = btrfs_inode_size(eb, inode_item); | ||
255 | nlink = btrfs_inode_nlink(eb, inode_item); | ||
256 | btrfs_release_path(swarn->path); | ||
257 | |||
258 | ipath = init_ipath(4096, local_root, swarn->path); | ||
259 | if (IS_ERR(ipath)) { | ||
260 | ret = PTR_ERR(ipath); | ||
261 | ipath = NULL; | ||
262 | goto err; | ||
263 | } | ||
264 | ret = paths_from_inode(inum, ipath); | ||
265 | |||
266 | if (ret < 0) | ||
267 | goto err; | ||
268 | |||
269 | /* | ||
270 | * we deliberately ignore the fact that ipath might have been too small to | ||
271 | * hold all of the paths here | ||
272 | */ | ||
273 | for (i = 0; i < ipath->fspath->elem_cnt; ++i) | ||
274 | printk(KERN_WARNING "btrfs: %s at logical %llu on dev " | ||
275 | "%s, sector %llu, root %llu, inode %llu, offset %llu, " | ||
276 | "length %llu, links %u (path: %s)\n", swarn->errstr, | ||
277 | swarn->logical, swarn->dev->name, | ||
278 | (unsigned long long)swarn->sector, root, inum, offset, | ||
279 | min(isize - offset, (u64)PAGE_SIZE), nlink, | ||
280 | (char *)(unsigned long)ipath->fspath->val[i]); | ||
281 | |||
282 | free_ipath(ipath); | ||
283 | return 0; | ||
284 | |||
285 | err: | ||
286 | printk(KERN_WARNING "btrfs: %s at logical %llu on dev " | ||
287 | "%s, sector %llu, root %llu, inode %llu, offset %llu: path " | ||
288 | "resolving failed with ret=%d\n", swarn->errstr, | ||
289 | swarn->logical, swarn->dev->name, | ||
290 | (unsigned long long)swarn->sector, root, inum, offset, ret); | ||
291 | |||
292 | free_ipath(ipath); | ||
293 | return 0; | ||
294 | } | ||
295 | |||
296 | static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, | ||
297 | int ix) | ||
298 | { | ||
299 | struct btrfs_device *dev = sbio->sdev->dev; | ||
300 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; | ||
301 | struct btrfs_path *path; | ||
302 | struct btrfs_key found_key; | ||
303 | struct extent_buffer *eb; | ||
304 | struct btrfs_extent_item *ei; | ||
305 | struct scrub_warning swarn; | ||
306 | u32 item_size; | ||
307 | int ret; | ||
308 | u64 ref_root; | ||
309 | u8 ref_level; | ||
310 | unsigned long ptr = 0; | ||
311 | const int bufsize = 4096; | ||
312 | u64 extent_offset; | ||
313 | |||
314 | path = btrfs_alloc_path(); | ||
315 | |||
316 | swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); | ||
317 | swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); | ||
318 | swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; | ||
319 | swarn.logical = sbio->logical + ix * PAGE_SIZE; | ||
320 | swarn.errstr = errstr; | ||
321 | swarn.dev = dev; | ||
322 | swarn.msg_bufsize = bufsize; | ||
323 | swarn.scratch_bufsize = bufsize; | ||
324 | |||
325 | if (!path || !swarn.scratch_buf || !swarn.msg_buf) | ||
326 | goto out; | ||
327 | |||
328 | ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); | ||
329 | if (ret < 0) | ||
330 | goto out; | ||
331 | |||
332 | extent_offset = swarn.logical - found_key.objectid; | ||
333 | swarn.extent_item_size = found_key.offset; | ||
334 | |||
335 | eb = path->nodes[0]; | ||
336 | ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); | ||
337 | item_size = btrfs_item_size_nr(eb, path->slots[0]); | ||
338 | |||
339 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
340 | do { | ||
341 | ret = tree_backref_for_extent(&ptr, eb, ei, item_size, | ||
342 | &ref_root, &ref_level); | ||
343 | printk(KERN_WARNING "%s at logical %llu on dev %s, " | ||
344 | "sector %llu: metadata %s (level %d) in tree " | ||
345 | "%llu\n", errstr, swarn.logical, dev->name, | ||
346 | (unsigned long long)swarn.sector, | ||
347 | ref_level ? "node" : "leaf", | ||
348 | ret < 0 ? -1 : ref_level, | ||
349 | ret < 0 ? -1 : ref_root); | ||
350 | } while (ret != 1); | ||
351 | } else { | ||
352 | swarn.path = path; | ||
353 | iterate_extent_inodes(fs_info, path, found_key.objectid, | ||
354 | extent_offset, | ||
355 | scrub_print_warning_inode, &swarn); | ||
356 | } | ||
357 | |||
358 | out: | ||
359 | btrfs_free_path(path); | ||
360 | kfree(swarn.scratch_buf); | ||
361 | kfree(swarn.msg_buf); | ||
362 | } | ||
363 | |||
364 | static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) | ||
365 | { | ||
366 | struct page *page = NULL; | ||
367 | unsigned long index; | ||
368 | struct scrub_fixup_nodatasum *fixup = ctx; | ||
369 | int ret; | ||
370 | int corrected = 0; | ||
371 | struct btrfs_key key; | ||
372 | struct inode *inode = NULL; | ||
373 | u64 end = offset + PAGE_SIZE - 1; | ||
374 | struct btrfs_root *local_root; | ||
375 | |||
376 | key.objectid = root; | ||
377 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
378 | key.offset = (u64)-1; | ||
379 | local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); | ||
380 | if (IS_ERR(local_root)) | ||
381 | return PTR_ERR(local_root); | ||
382 | |||
383 | key.type = BTRFS_INODE_ITEM_KEY; | ||
384 | key.objectid = inum; | ||
385 | key.offset = 0; | ||
386 | inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); | ||
387 | if (IS_ERR(inode)) | ||
388 | return PTR_ERR(inode); | ||
389 | |||
390 | index = offset >> PAGE_CACHE_SHIFT; | ||
391 | |||
392 | page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); | ||
393 | if (!page) { | ||
394 | ret = -ENOMEM; | ||
395 | goto out; | ||
396 | } | ||
397 | |||
398 | if (PageUptodate(page)) { | ||
399 | struct btrfs_mapping_tree *map_tree; | ||
400 | if (PageDirty(page)) { | ||
401 | /* | ||
402 | * we need to write the data to the defect sector. the | ||
403 | * data that was in that sector is not in memory, | ||
404 | * because the page was modified. we must not write the | ||
405 | * modified page to that sector. | ||
406 | * | ||
407 | * TODO: what could be done here: wait for the delalloc | ||
408 | * runner to write out that page (might involve | ||
409 | * COW) and see whether the sector is still | ||
410 | * referenced afterwards. | ||
411 | * | ||
412 | * For the meantime, we'll treat this error as | ||
413 | * uncorrectable, although there is a chance that a | ||
414 | * later scrub will find the bad sector again when | ||
415 | * there's no dirty page in memory. | ||
416 | */ | ||
417 | ret = -EIO; | ||
418 | goto out; | ||
419 | } | ||
420 | map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; | ||
421 | ret = repair_io_failure(map_tree, offset, PAGE_SIZE, | ||
422 | fixup->logical, page, | ||
423 | fixup->mirror_num); | ||
424 | unlock_page(page); | ||
425 | corrected = !ret; | ||
426 | } else { | ||
427 | /* | ||
428 | * we need to get good data first. the general readpage path | ||
429 | * will call repair_io_failure for us, we just have to make | ||
430 | * sure we read the bad mirror. | ||
431 | */ | ||
432 | ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, | ||
433 | EXTENT_DAMAGED, GFP_NOFS); | ||
434 | if (ret) { | ||
435 | /* set_extent_bits should give proper error */ | ||
436 | WARN_ON(ret > 0); | ||
437 | if (ret > 0) | ||
438 | ret = -EFAULT; | ||
439 | goto out; | ||
440 | } | ||
441 | |||
442 | ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, | ||
443 | btrfs_get_extent, | ||
444 | fixup->mirror_num); | ||
445 | wait_on_page_locked(page); | ||
446 | |||
447 | corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, | ||
448 | end, EXTENT_DAMAGED, 0, NULL); | ||
449 | if (!corrected) | ||
450 | clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, | ||
451 | EXTENT_DAMAGED, GFP_NOFS); | ||
452 | } | ||
453 | |||
454 | out: | ||
455 | if (page) | ||
456 | put_page(page); | ||
457 | if (inode) | ||
458 | iput(inode); | ||
459 | |||
460 | if (ret < 0) | ||
461 | return ret; | ||
462 | |||
463 | if (ret == 0 && corrected) { | ||
464 | /* | ||
465 | * we only need to call readpage for one of the inodes belonging | ||
466 | * to this extent. so make iterate_extent_inodes stop | ||
467 | */ | ||
468 | return 1; | ||
469 | } | ||
470 | |||
471 | return -EIO; | ||
472 | } | ||
473 | |||
474 | static void scrub_fixup_nodatasum(struct btrfs_work *work) | ||
475 | { | ||
476 | int ret; | ||
477 | struct scrub_fixup_nodatasum *fixup; | ||
478 | struct scrub_dev *sdev; | ||
479 | struct btrfs_trans_handle *trans = NULL; | ||
480 | struct btrfs_fs_info *fs_info; | ||
481 | struct btrfs_path *path; | ||
482 | int uncorrectable = 0; | ||
483 | |||
484 | fixup = container_of(work, struct scrub_fixup_nodatasum, work); | ||
485 | sdev = fixup->sdev; | ||
486 | fs_info = fixup->root->fs_info; | ||
487 | |||
488 | path = btrfs_alloc_path(); | ||
489 | if (!path) { | ||
490 | spin_lock(&sdev->stat_lock); | ||
491 | ++sdev->stat.malloc_errors; | ||
492 | spin_unlock(&sdev->stat_lock); | ||
493 | uncorrectable = 1; | ||
494 | goto out; | ||
495 | } | ||
496 | |||
497 | trans = btrfs_join_transaction(fixup->root); | ||
498 | if (IS_ERR(trans)) { | ||
499 | uncorrectable = 1; | ||
500 | goto out; | ||
501 | } | ||
502 | |||
503 | /* | ||
504 | * the idea is to trigger a regular read through the standard path. we | ||
505 | * read a page from the (failed) logical address by specifying the | ||
506 | * corresponding copynum of the failed sector. thus, that readpage is | ||
507 | * expected to fail. | ||
508 | * that is the point where on-the-fly error correction will kick in | ||
509 | * (once it's finished) and rewrite the failed sector if a good copy | ||
510 | * can be found. | ||
511 | */ | ||
512 | ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, | ||
513 | path, scrub_fixup_readpage, | ||
514 | fixup); | ||
515 | if (ret < 0) { | ||
516 | uncorrectable = 1; | ||
517 | goto out; | ||
518 | } | ||
519 | WARN_ON(ret != 1); | ||
520 | |||
521 | spin_lock(&sdev->stat_lock); | ||
522 | ++sdev->stat.corrected_errors; | ||
523 | spin_unlock(&sdev->stat_lock); | ||
524 | |||
525 | out: | ||
526 | if (trans && !IS_ERR(trans)) | ||
527 | btrfs_end_transaction(trans, fixup->root); | ||
528 | if (uncorrectable) { | ||
529 | spin_lock(&sdev->stat_lock); | ||
530 | ++sdev->stat.uncorrectable_errors; | ||
531 | spin_unlock(&sdev->stat_lock); | ||
532 | printk_ratelimited(KERN_ERR "btrfs: unable to fixup " | ||
533 | "(nodatasum) error at logical %llu\n", | ||
534 | fixup->logical); | ||
535 | } | ||
536 | |||
537 | btrfs_free_path(path); | ||
538 | kfree(fixup); | ||
539 | |||
540 | /* see the caller for why we pretend to be paused in the scrub counters */ | ||
541 | mutex_lock(&fs_info->scrub_lock); | ||
542 | atomic_dec(&fs_info->scrubs_running); | ||
543 | atomic_dec(&fs_info->scrubs_paused); | ||
544 | mutex_unlock(&fs_info->scrub_lock); | ||
545 | atomic_dec(&sdev->fixup_cnt); | ||
546 | wake_up(&fs_info->scrub_pause_wait); | ||
547 | wake_up(&sdev->list_wait); | ||
548 | } | ||
549 | |||
198 | /* | 550 | /* |
199 | * scrub_recheck_error gets called when either verification of the page | 551 | * scrub_recheck_error gets called when either verification of the page |
200 | * failed or the bio failed to read, e.g. with EIO. In the latter case, | 552 | * failed or the bio failed to read, e.g. with EIO. In the latter case, |
201 | * recheck_error gets called for every page in the bio, even though only | 553 | * recheck_error gets called for every page in the bio, even though only |
202 | * one may be bad | 554 | * one may be bad |
203 | */ | 555 | */ |
204 | static void scrub_recheck_error(struct scrub_bio *sbio, int ix) | 556 | static int scrub_recheck_error(struct scrub_bio *sbio, int ix) |
205 | { | 557 | { |
558 | struct scrub_dev *sdev = sbio->sdev; | ||
559 | u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; | ||
560 | static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
561 | DEFAULT_RATELIMIT_BURST); | ||
562 | |||
206 | if (sbio->err) { | 563 | if (sbio->err) { |
207 | if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, | 564 | if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, |
208 | (sbio->physical + ix * PAGE_SIZE) >> 9, | ||
209 | sbio->bio->bi_io_vec[ix].bv_page) == 0) { | 565 | sbio->bio->bi_io_vec[ix].bv_page) == 0) { |
210 | if (scrub_fixup_check(sbio, ix) == 0) | 566 | if (scrub_fixup_check(sbio, ix) == 0) |
211 | return; | 567 | return 0; |
212 | } | 568 | } |
569 | if (__ratelimit(&_rs)) | ||
570 | scrub_print_warning("i/o error", sbio, ix); | ||
571 | } else { | ||
572 | if (__ratelimit(&_rs)) | ||
573 | scrub_print_warning("checksum error", sbio, ix); | ||
213 | } | 574 | } |
214 | 575 | ||
576 | spin_lock(&sdev->stat_lock); | ||
577 | ++sdev->stat.read_errors; | ||
578 | spin_unlock(&sdev->stat_lock); | ||
579 | |||
215 | scrub_fixup(sbio, ix); | 580 | scrub_fixup(sbio, ix); |
581 | return 1; | ||
216 | } | 582 | } |
217 | 583 | ||
218 | static int scrub_fixup_check(struct scrub_bio *sbio, int ix) | 584 | static int scrub_fixup_check(struct scrub_bio *sbio, int ix) |
@@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
250 | struct scrub_dev *sdev = sbio->sdev; | 616 | struct scrub_dev *sdev = sbio->sdev; |
251 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; | 617 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; |
252 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | 618 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; |
253 | struct btrfs_multi_bio *multi = NULL; | 619 | struct btrfs_bio *bbio = NULL; |
620 | struct scrub_fixup_nodatasum *fixup; | ||
254 | u64 logical = sbio->logical + ix * PAGE_SIZE; | 621 | u64 logical = sbio->logical + ix * PAGE_SIZE; |
255 | u64 length; | 622 | u64 length; |
256 | int i; | 623 | int i; |
@@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
259 | 626 | ||
260 | if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && | 627 | if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && |
261 | (sbio->spag[ix].have_csum == 0)) { | 628 | (sbio->spag[ix].have_csum == 0)) { |
629 | fixup = kzalloc(sizeof(*fixup), GFP_NOFS); | ||
630 | if (!fixup) | ||
631 | goto uncorrectable; | ||
632 | fixup->sdev = sdev; | ||
633 | fixup->logical = logical; | ||
634 | fixup->root = fs_info->extent_root; | ||
635 | fixup->mirror_num = sbio->spag[ix].mirror_num; | ||
262 | /* | 636 | /* |
263 | * nodatasum, don't try to fix anything | 637 | * increment scrubs_running to prevent cancel requests from |
264 | * FIXME: we can do better, open the inode and trigger a | 638 | * completing as long as a fixup worker is running. we must also |
265 | * writeback | 639 | * increment scrubs_paused to prevent deadlocking on pause |
640 | * requests used for transaction commits (as the worker uses a | ||
641 | * transaction context). it is safe to regard the fixup worker | ||
642 | * as paused for all practical matters. effectively, we only | ||
643 | * avoid cancellation requests from completing. | ||
266 | */ | 644 | */ |
267 | goto uncorrectable; | 645 | mutex_lock(&fs_info->scrub_lock); |
646 | atomic_inc(&fs_info->scrubs_running); | ||
647 | atomic_inc(&fs_info->scrubs_paused); | ||
648 | mutex_unlock(&fs_info->scrub_lock); | ||
649 | atomic_inc(&sdev->fixup_cnt); | ||
650 | fixup->work.func = scrub_fixup_nodatasum; | ||
651 | btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); | ||
652 | return; | ||
268 | } | 653 | } |
269 | 654 | ||
270 | length = PAGE_SIZE; | 655 | length = PAGE_SIZE; |
271 | ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, | 656 | ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, |
272 | &multi, 0); | 657 | &bbio, 0); |
273 | if (ret || !multi || length < PAGE_SIZE) { | 658 | if (ret || !bbio || length < PAGE_SIZE) { |
274 | printk(KERN_ERR | 659 | printk(KERN_ERR |
275 | "scrub_fixup: btrfs_map_block failed us for %llu\n", | 660 | "scrub_fixup: btrfs_map_block failed us for %llu\n", |
276 | (unsigned long long)logical); | 661 | (unsigned long long)logical); |
277 | WARN_ON(1); | 662 | WARN_ON(1); |
663 | kfree(bbio); | ||
278 | return; | 664 | return; |
279 | } | 665 | } |
280 | 666 | ||
281 | if (multi->num_stripes == 1) | 667 | if (bbio->num_stripes == 1) |
282 | /* there aren't any replicas */ | 668 | /* there aren't any replicas */ |
283 | goto uncorrectable; | 669 | goto uncorrectable; |
284 | 670 | ||
285 | /* | 671 | /* |
286 | * first find a good copy | 672 | * first find a good copy |
287 | */ | 673 | */ |
288 | for (i = 0; i < multi->num_stripes; ++i) { | 674 | for (i = 0; i < bbio->num_stripes; ++i) { |
289 | if (i == sbio->spag[ix].mirror_num) | 675 | if (i + 1 == sbio->spag[ix].mirror_num) |
290 | continue; | 676 | continue; |
291 | 677 | ||
292 | if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, | 678 | if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, |
293 | multi->stripes[i].physical >> 9, | 679 | bbio->stripes[i].physical >> 9, |
294 | sbio->bio->bi_io_vec[ix].bv_page)) { | 680 | sbio->bio->bi_io_vec[ix].bv_page)) { |
295 | /* I/O error, this is not a good copy */ | 681 | /* I/O error, this is not a good copy */
296 | continue; | 682 | continue; |
@@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
299 | if (scrub_fixup_check(sbio, ix) == 0) | 685 | if (scrub_fixup_check(sbio, ix) == 0) |
300 | break; | 686 | break; |
301 | } | 687 | } |
302 | if (i == multi->num_stripes) | 688 | if (i == bbio->num_stripes) |
303 | goto uncorrectable; | 689 | goto uncorrectable; |
304 | 690 | ||
305 | if (!sdev->readonly) { | 691 | if (!sdev->readonly) { |
@@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
314 | } | 700 | } |
315 | } | 701 | } |
316 | 702 | ||
317 | kfree(multi); | 703 | kfree(bbio); |
318 | spin_lock(&sdev->stat_lock); | 704 | spin_lock(&sdev->stat_lock); |
319 | ++sdev->stat.corrected_errors; | 705 | ++sdev->stat.corrected_errors; |
320 | spin_unlock(&sdev->stat_lock); | 706 | spin_unlock(&sdev->stat_lock); |
321 | 707 | ||
322 | if (printk_ratelimit()) | 708 | printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", |
323 | printk(KERN_ERR "btrfs: fixed up at %llu\n", | 709 | (unsigned long long)logical); |
324 | (unsigned long long)logical); | ||
325 | return; | 710 | return; |
326 | 711 | ||
327 | uncorrectable: | 712 | uncorrectable: |
328 | kfree(multi); | 713 | kfree(bbio); |
329 | spin_lock(&sdev->stat_lock); | 714 | spin_lock(&sdev->stat_lock); |
330 | ++sdev->stat.uncorrectable_errors; | 715 | ++sdev->stat.uncorrectable_errors; |
331 | spin_unlock(&sdev->stat_lock); | 716 | spin_unlock(&sdev->stat_lock); |
332 | 717 | ||
333 | if (printk_ratelimit()) | 718 | printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " |
334 | printk(KERN_ERR "btrfs: unable to fixup at %llu\n", | 719 | "logical %llu\n", (unsigned long long)logical); |
335 | (unsigned long long)logical); | ||
336 | } | 720 | } |
337 | 721 | ||
338 | static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, | 722 | static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, |
@@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work) | |||
382 | int ret; | 766 | int ret; |
383 | 767 | ||
384 | if (sbio->err) { | 768 | if (sbio->err) { |
769 | ret = 0; | ||
385 | for (i = 0; i < sbio->count; ++i) | 770 | for (i = 0; i < sbio->count; ++i) |
386 | scrub_recheck_error(sbio, i); | 771 | ret |= scrub_recheck_error(sbio, i); |
772 | if (!ret) { | ||
773 | spin_lock(&sdev->stat_lock); | ||
774 | ++sdev->stat.unverified_errors; | ||
775 | spin_unlock(&sdev->stat_lock); | ||
776 | } | ||
387 | 777 | ||
388 | sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); | 778 | sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); |
389 | sbio->bio->bi_flags |= 1 << BIO_UPTODATE; | 779 | sbio->bio->bi_flags |= 1 << BIO_UPTODATE; |
@@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work) | |||
396 | bi->bv_offset = 0; | 786 | bi->bv_offset = 0; |
397 | bi->bv_len = PAGE_SIZE; | 787 | bi->bv_len = PAGE_SIZE; |
398 | } | 788 | } |
399 | |||
400 | spin_lock(&sdev->stat_lock); | ||
401 | ++sdev->stat.read_errors; | ||
402 | spin_unlock(&sdev->stat_lock); | ||
403 | goto out; | 789 | goto out; |
404 | } | 790 | } |
405 | for (i = 0; i < sbio->count; ++i) { | 791 | for (i = 0; i < sbio->count; ++i) { |
@@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work) | |||
420 | WARN_ON(1); | 806 | WARN_ON(1); |
421 | } | 807 | } |
422 | kunmap_atomic(buffer, KM_USER0); | 808 | kunmap_atomic(buffer, KM_USER0); |
423 | if (ret) | 809 | if (ret) { |
424 | scrub_recheck_error(sbio, i); | 810 | ret = scrub_recheck_error(sbio, i); |
811 | if (!ret) { | ||
812 | spin_lock(&sdev->stat_lock); | ||
813 | ++sdev->stat.unverified_errors; | ||
814 | spin_unlock(&sdev->stat_lock); | ||
815 | } | ||
816 | } | ||
425 | } | 817 | } |
426 | 818 | ||
427 | out: | 819 | out: |
@@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) | |||
557 | static int scrub_submit(struct scrub_dev *sdev) | 949 | static int scrub_submit(struct scrub_dev *sdev) |
558 | { | 950 | { |
559 | struct scrub_bio *sbio; | 951 | struct scrub_bio *sbio; |
560 | struct bio *bio; | ||
561 | int i; | ||
562 | 952 | ||
563 | if (sdev->curr == -1) | 953 | if (sdev->curr == -1) |
564 | return 0; | 954 | return 0; |
565 | 955 | ||
566 | sbio = sdev->bios[sdev->curr]; | 956 | sbio = sdev->bios[sdev->curr]; |
567 | |||
568 | bio = bio_alloc(GFP_NOFS, sbio->count); | ||
569 | if (!bio) | ||
570 | goto nomem; | ||
571 | |||
572 | bio->bi_private = sbio; | ||
573 | bio->bi_end_io = scrub_bio_end_io; | ||
574 | bio->bi_bdev = sdev->dev->bdev; | ||
575 | bio->bi_sector = sbio->physical >> 9; | ||
576 | |||
577 | for (i = 0; i < sbio->count; ++i) { | ||
578 | struct page *page; | ||
579 | int ret; | ||
580 | |||
581 | page = alloc_page(GFP_NOFS); | ||
582 | if (!page) | ||
583 | goto nomem; | ||
584 | |||
585 | ret = bio_add_page(bio, page, PAGE_SIZE, 0); | ||
586 | if (!ret) { | ||
587 | __free_page(page); | ||
588 | goto nomem; | ||
589 | } | ||
590 | } | ||
591 | |||
592 | sbio->err = 0; | 957 | sbio->err = 0; |
593 | sdev->curr = -1; | 958 | sdev->curr = -1; |
594 | atomic_inc(&sdev->in_flight); | 959 | atomic_inc(&sdev->in_flight); |
595 | 960 | ||
596 | submit_bio(READ, bio); | 961 | submit_bio(READ, sbio->bio); |
597 | 962 | ||
598 | return 0; | 963 | return 0; |
599 | |||
600 | nomem: | ||
601 | scrub_free_bio(bio); | ||
602 | |||
603 | return -ENOMEM; | ||
604 | } | 964 | } |
605 | 965 | ||
606 | static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, | 966 | static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, |
607 | u64 physical, u64 flags, u64 gen, u64 mirror_num, | 967 | u64 physical, u64 flags, u64 gen, int mirror_num, |
608 | u8 *csum, int force) | 968 | u8 *csum, int force) |
609 | { | 969 | { |
610 | struct scrub_bio *sbio; | 970 | struct scrub_bio *sbio; |
971 | struct page *page; | ||
972 | int ret; | ||
611 | 973 | ||
612 | again: | 974 | again: |
613 | /* | 975 | /* |
@@ -628,12 +990,22 @@ again: | |||
628 | } | 990 | } |
629 | sbio = sdev->bios[sdev->curr]; | 991 | sbio = sdev->bios[sdev->curr]; |
630 | if (sbio->count == 0) { | 992 | if (sbio->count == 0) { |
993 | struct bio *bio; | ||
994 | |||
631 | sbio->physical = physical; | 995 | sbio->physical = physical; |
632 | sbio->logical = logical; | 996 | sbio->logical = logical; |
997 | bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); | ||
998 | if (!bio) | ||
999 | return -ENOMEM; | ||
1000 | |||
1001 | bio->bi_private = sbio; | ||
1002 | bio->bi_end_io = scrub_bio_end_io; | ||
1003 | bio->bi_bdev = sdev->dev->bdev; | ||
1004 | bio->bi_sector = sbio->physical >> 9; | ||
1005 | sbio->err = 0; | ||
1006 | sbio->bio = bio; | ||
633 | } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || | 1007 | } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || |
634 | sbio->logical + sbio->count * PAGE_SIZE != logical) { | 1008 | sbio->logical + sbio->count * PAGE_SIZE != logical) { |
635 | int ret; | ||
636 | |||
637 | ret = scrub_submit(sdev); | 1009 | ret = scrub_submit(sdev); |
638 | if (ret) | 1010 | if (ret) |
639 | return ret; | 1011 | return ret; |
@@ -643,6 +1015,20 @@ again: | |||
643 | sbio->spag[sbio->count].generation = gen; | 1015 | sbio->spag[sbio->count].generation = gen; |
644 | sbio->spag[sbio->count].have_csum = 0; | 1016 | sbio->spag[sbio->count].have_csum = 0; |
645 | sbio->spag[sbio->count].mirror_num = mirror_num; | 1017 | sbio->spag[sbio->count].mirror_num = mirror_num; |
1018 | |||
1019 | page = alloc_page(GFP_NOFS); | ||
1020 | if (!page) | ||
1021 | return -ENOMEM; | ||
1022 | |||
1023 | ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); | ||
1024 | if (!ret) { | ||
1025 | __free_page(page); | ||
1026 | ret = scrub_submit(sdev); | ||
1027 | if (ret) | ||
1028 | return ret; | ||
1029 | goto again; | ||
1030 | } | ||
1031 | |||
646 | if (csum) { | 1032 | if (csum) { |
647 | sbio->spag[sbio->count].have_csum = 1; | 1033 | sbio->spag[sbio->count].have_csum = 1; |
648 | memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); | 1034 | memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); |
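
Moving bio_alloc() into scrub_page() turns it into a fill-until-full loop. Reduced to its shape (a sketch of the control flow only, reordered slightly for readability; names as in the diff):

	again:
		if (sbio->count == 0) {		/* first page of a new bio */
			sbio->bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
			if (!sbio->bio)
				return -ENOMEM;
			/* set bi_private, bi_end_io, bi_bdev, bi_sector */
		}
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		/* bio_add_page() returns 0 once the bio is full */
		if (!bio_add_page(sbio->bio, page, PAGE_SIZE, 0)) {
			__free_page(page);
			ret = scrub_submit(sdev);	/* flush the full bio */
			if (ret)
				return ret;
			goto again;			/* retry on a fresh bio */
		}

The payoff is visible in the shortened scrub_submit() above: with all allocation done here, submit itself can no longer fail with -ENOMEM.
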
@@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | |||
701 | 1087 | ||
702 | /* scrub extent tries to collect up to 64 kB for each bio */ | 1088 | /* scrub extent tries to collect up to 64 kB for each bio */ |
703 | static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, | 1089 | static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, |
704 | u64 physical, u64 flags, u64 gen, u64 mirror_num) | 1090 | u64 physical, u64 flags, u64 gen, int mirror_num) |
705 | { | 1091 | { |
706 | int ret; | 1092 | int ret; |
707 | u8 csum[BTRFS_CSUM_SIZE]; | 1093 | u8 csum[BTRFS_CSUM_SIZE]; |
@@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
741 | int slot; | 1127 | int slot; |
742 | int i; | 1128 | int i; |
743 | u64 nstripes; | 1129 | u64 nstripes; |
744 | int start_stripe; | ||
745 | struct extent_buffer *l; | 1130 | struct extent_buffer *l; |
746 | struct btrfs_key key; | 1131 | struct btrfs_key key; |
747 | u64 physical; | 1132 | u64 physical; |
748 | u64 logical; | 1133 | u64 logical; |
749 | u64 generation; | 1134 | u64 generation; |
750 | u64 mirror_num; | 1135 | int mirror_num; |
1136 | struct reada_control *reada1; | ||
1137 | struct reada_control *reada2; | ||
1138 | struct btrfs_key key_start; | ||
1139 | struct btrfs_key key_end; | ||
751 | 1140 | ||
752 | u64 increment = map->stripe_len; | 1141 | u64 increment = map->stripe_len; |
753 | u64 offset; | 1142 | u64 offset; |
@@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
758 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 1147 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
759 | offset = map->stripe_len * num; | 1148 | offset = map->stripe_len * num; |
760 | increment = map->stripe_len * map->num_stripes; | 1149 | increment = map->stripe_len * map->num_stripes; |
761 | mirror_num = 0; | 1150 | mirror_num = 1; |
762 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | 1151 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { |
763 | int factor = map->num_stripes / map->sub_stripes; | 1152 | int factor = map->num_stripes / map->sub_stripes; |
764 | offset = map->stripe_len * (num / map->sub_stripes); | 1153 | offset = map->stripe_len * (num / map->sub_stripes); |
765 | increment = map->stripe_len * factor; | 1154 | increment = map->stripe_len * factor; |
766 | mirror_num = num % map->sub_stripes; | 1155 | mirror_num = num % map->sub_stripes + 1; |
767 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { | 1156 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { |
768 | increment = map->stripe_len; | 1157 | increment = map->stripe_len; |
769 | mirror_num = num % map->num_stripes; | 1158 | mirror_num = num % map->num_stripes + 1; |
770 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { | 1159 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { |
771 | increment = map->stripe_len; | 1160 | increment = map->stripe_len; |
772 | mirror_num = num % map->num_stripes; | 1161 | mirror_num = num % map->num_stripes + 1; |
773 | } else { | 1162 | } else { |
774 | increment = map->stripe_len; | 1163 | increment = map->stripe_len; |
775 | mirror_num = 0; | 1164 | mirror_num = 1; |
776 | } | 1165 | } |
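
A quick check of the new 1-based convention, for a hypothetical RAID10 layout (num_stripes = 4, sub_stripes = 2):

	/* device num 0 -> mirror_num = 0 % 2 + 1 = 1  (first copy)
	 * device num 1 -> mirror_num = 1 % 2 + 1 = 2  (second copy)
	 * device num 2 -> mirror_num = 2 % 2 + 1 = 1
	 * device num 3 -> mirror_num = 3 % 2 + 1 = 2
	 *
	 * mirror_num 0 is now reserved for "any mirror", which is why the
	 * rewritten loop in scrub_fixup() above skips the already-read copy
	 * with `i + 1 == sbio->spag[ix].mirror_num' instead of `i == ...'.
	 */
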
777 | 1166 | ||
778 | path = btrfs_alloc_path(); | 1167 | path = btrfs_alloc_path(); |
779 | if (!path) | 1168 | if (!path) |
780 | return -ENOMEM; | 1169 | return -ENOMEM; |
781 | 1170 | ||
782 | path->reada = 2; | ||
783 | path->search_commit_root = 1; | 1171 | path->search_commit_root = 1; |
784 | path->skip_locking = 1; | 1172 | path->skip_locking = 1; |
785 | 1173 | ||
786 | /* | 1174 | /* |
787 | * find all extents for each stripe and just read them to get | 1175 | * trigger the readahead for extent tree and csum tree and wait for
788 | * them into the page cache | 1176 | * completion. During readahead, the scrub is officially paused
789 | * FIXME: we can do better. build a more intelligent prefetching | 1177 | * to not hold off transaction commits.
790 | */ | 1178 | */ |
791 | logical = base + offset; | 1179 | logical = base + offset; |
792 | physical = map->stripes[num].physical; | ||
793 | ret = 0; | ||
794 | for (i = 0; i < nstripes; ++i) { | ||
795 | key.objectid = logical; | ||
796 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
797 | key.offset = (u64)0; | ||
798 | |||
799 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
800 | if (ret < 0) | ||
801 | goto out_noplug; | ||
802 | |||
803 | /* | ||
804 | * we might miss half an extent here, but that doesn't matter, | ||
805 | * as it's only the prefetch | ||
806 | */ | ||
807 | while (1) { | ||
808 | l = path->nodes[0]; | ||
809 | slot = path->slots[0]; | ||
810 | if (slot >= btrfs_header_nritems(l)) { | ||
811 | ret = btrfs_next_leaf(root, path); | ||
812 | if (ret == 0) | ||
813 | continue; | ||
814 | if (ret < 0) | ||
815 | goto out_noplug; | ||
816 | 1180 | ||
817 | break; | 1181 | wait_event(sdev->list_wait, |
818 | } | 1182 | atomic_read(&sdev->in_flight) == 0); |
819 | btrfs_item_key_to_cpu(l, &key, slot); | 1183 | atomic_inc(&fs_info->scrubs_paused); |
1184 | wake_up(&fs_info->scrub_pause_wait); | ||
820 | 1185 | ||
821 | if (key.objectid >= logical + map->stripe_len) | 1186 | /* FIXME it might be better to start readahead at commit root */ |
822 | break; | 1187 | key_start.objectid = logical; |
1188 | key_start.type = BTRFS_EXTENT_ITEM_KEY; | ||
1189 | key_start.offset = (u64)0; | ||
1190 | key_end.objectid = base + offset + nstripes * increment; | ||
1191 | key_end.type = BTRFS_EXTENT_ITEM_KEY; | ||
1192 | key_end.offset = (u64)0; | ||
1193 | reada1 = btrfs_reada_add(root, &key_start, &key_end); | ||
1194 | |||
1195 | key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
1196 | key_start.type = BTRFS_EXTENT_CSUM_KEY; | ||
1197 | key_start.offset = logical; | ||
1198 | key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
1199 | key_end.type = BTRFS_EXTENT_CSUM_KEY; | ||
1200 | key_end.offset = base + offset + nstripes * increment; | ||
1201 | reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); | ||
1202 | |||
1203 | if (!IS_ERR(reada1)) | ||
1204 | btrfs_reada_wait(reada1); | ||
1205 | if (!IS_ERR(reada2)) | ||
1206 | btrfs_reada_wait(reada2); | ||
823 | 1207 | ||
824 | path->slots[0]++; | 1208 | mutex_lock(&fs_info->scrub_lock); |
825 | } | 1209 | while (atomic_read(&fs_info->scrub_pause_req)) { |
826 | btrfs_release_path(path); | 1210 | mutex_unlock(&fs_info->scrub_lock); |
827 | logical += increment; | 1211 | wait_event(fs_info->scrub_pause_wait, |
828 | physical += map->stripe_len; | 1212 | atomic_read(&fs_info->scrub_pause_req) == 0); |
829 | cond_resched(); | 1213 | mutex_lock(&fs_info->scrub_lock); |
830 | } | 1214 | } |
1215 | atomic_dec(&fs_info->scrubs_paused); | ||
1216 | mutex_unlock(&fs_info->scrub_lock); | ||
1217 | wake_up(&fs_info->scrub_pause_wait); | ||
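
The readahead trigger above boils down to a small usage pattern for btrfs_reada_add()/btrfs_reada_wait(). A sketch with a hypothetical key range (first_logical/last_logical are assumed inputs):

	struct btrfs_key key_start, key_end;
	struct reada_control *rc;

	key_start.objectid = first_logical;
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = 0;
	key_end.objectid = last_logical;
	key_end.type = BTRFS_EXTENT_ITEM_KEY;
	key_end.offset = 0;

	rc = btrfs_reada_add(root, &key_start, &key_end);
	if (!IS_ERR(rc))
		btrfs_reada_wait(rc);	/* returns once the range is read in */
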
831 | 1218 | ||
832 | /* | 1219 | /* |
833 | * collect all data csums for the stripe to avoid seeking during | 1220 | * collect all data csums for the stripe to avoid seeking during |
834 | * the scrub. This might currently (crc32) end up being about 1MB | 1221 | * the scrub. This might currently (crc32) end up being about 1MB
835 | */ | 1222 | */ |
836 | start_stripe = 0; | ||
837 | blk_start_plug(&plug); | 1223 | blk_start_plug(&plug); |
838 | again: | ||
839 | logical = base + offset + start_stripe * increment; | ||
840 | for (i = start_stripe; i < nstripes; ++i) { | ||
841 | ret = btrfs_lookup_csums_range(csum_root, logical, | ||
842 | logical + map->stripe_len - 1, | ||
843 | &sdev->csum_list, 1); | ||
844 | if (ret) | ||
845 | goto out; | ||
846 | 1224 | ||
847 | logical += increment; | ||
848 | cond_resched(); | ||
849 | } | ||
850 | /* | 1225 | /* |
851 | * now find all extents for each stripe and scrub them | 1226 | * now find all extents for each stripe and scrub them |
852 | */ | 1227 | */ |
853 | logical = base + offset + start_stripe * increment; | 1228 | logical = base + offset; |
854 | physical = map->stripes[num].physical + start_stripe * map->stripe_len; | 1229 | physical = map->stripes[num].physical; |
855 | ret = 0; | 1230 | ret = 0; |
856 | for (i = start_stripe; i < nstripes; ++i) { | 1231 | for (i = 0; i < nstripes; ++i) { |
857 | /* | 1232 | /* |
858 | * canceled? | 1233 | * canceled? |
859 | */ | 1234 | */ |
@@ -882,11 +1257,14 @@ again: | |||
882 | atomic_dec(&fs_info->scrubs_paused); | 1257 | atomic_dec(&fs_info->scrubs_paused); |
883 | mutex_unlock(&fs_info->scrub_lock); | 1258 | mutex_unlock(&fs_info->scrub_lock); |
884 | wake_up(&fs_info->scrub_pause_wait); | 1259 | wake_up(&fs_info->scrub_pause_wait); |
885 | scrub_free_csums(sdev); | ||
886 | start_stripe = i; | ||
887 | goto again; | ||
888 | } | 1260 | } |
889 | 1261 | ||
1262 | ret = btrfs_lookup_csums_range(csum_root, logical, | ||
1263 | logical + map->stripe_len - 1, | ||
1264 | &sdev->csum_list, 1); | ||
1265 | if (ret) | ||
1266 | goto out; | ||
1267 | |||
890 | key.objectid = logical; | 1268 | key.objectid = logical; |
891 | key.type = BTRFS_EXTENT_ITEM_KEY; | 1269 | key.type = BTRFS_EXTENT_ITEM_KEY; |
892 | key.offset = (u64)0; | 1270 | key.offset = (u64)0; |
@@ -982,7 +1360,6 @@ next: | |||
982 | 1360 | ||
983 | out: | 1361 | out: |
984 | blk_finish_plug(&plug); | 1362 | blk_finish_plug(&plug); |
985 | out_noplug: | ||
986 | btrfs_free_path(path); | 1363 | btrfs_free_path(path); |
987 | return ret < 0 ? ret : 0; | 1364 | return ret < 0 ? ret : 0; |
988 | } | 1365 | } |
@@ -1253,10 +1630,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, | |||
1253 | ret = scrub_enumerate_chunks(sdev, start, end); | 1630 | ret = scrub_enumerate_chunks(sdev, start, end); |
1254 | 1631 | ||
1255 | wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); | 1632 | wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); |
1256 | |||
1257 | atomic_dec(&fs_info->scrubs_running); | 1633 | atomic_dec(&fs_info->scrubs_running); |
1258 | wake_up(&fs_info->scrub_pause_wait); | 1634 | wake_up(&fs_info->scrub_pause_wait); |
1259 | 1635 | ||
1636 | wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); | ||
1637 | |||
1260 | if (progress) | 1638 | if (progress) |
1261 | memcpy(progress, &sdev->stat, sizeof(*progress)); | 1639 | memcpy(progress, &sdev->stat, sizeof(*progress)); |
1262 | 1640 | ||
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 15634d4648d..e28ad4baf48 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/magic.h> | 40 | #include <linux/magic.h> |
41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
42 | #include <linux/cleancache.h> | 42 | #include <linux/cleancache.h> |
43 | #include <linux/mnt_namespace.h> | ||
43 | #include "compat.h" | 44 | #include "compat.h" |
44 | #include "delayed-inode.h" | 45 | #include "delayed-inode.h" |
45 | #include "ctree.h" | 46 | #include "ctree.h" |
@@ -58,6 +59,7 @@ | |||
58 | #include <trace/events/btrfs.h> | 59 | #include <trace/events/btrfs.h> |
59 | 60 | ||
60 | static const struct super_operations btrfs_super_ops; | 61 | static const struct super_operations btrfs_super_ops; |
62 | static struct file_system_type btrfs_fs_type; | ||
61 | 63 | ||
62 | static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, | 64 | static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, |
63 | char nbuf[16]) | 65 | char nbuf[16]) |
@@ -162,7 +164,7 @@ enum { | |||
162 | Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, | 164 | Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, |
163 | Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, | 165 | Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, |
164 | Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, | 166 | Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, |
165 | Opt_inode_cache, Opt_err, | 167 | Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, |
166 | }; | 168 | }; |
167 | 169 | ||
168 | static match_table_t tokens = { | 170 | static match_table_t tokens = { |
@@ -195,6 +197,8 @@ static match_table_t tokens = { | |||
195 | {Opt_subvolrootid, "subvolrootid=%d"}, | 197 | {Opt_subvolrootid, "subvolrootid=%d"}, |
196 | {Opt_defrag, "autodefrag"}, | 198 | {Opt_defrag, "autodefrag"}, |
197 | {Opt_inode_cache, "inode_cache"}, | 199 | {Opt_inode_cache, "inode_cache"}, |
200 | {Opt_no_space_cache, "nospace_cache"}, | ||
201 | {Opt_recovery, "recovery"}, | ||
198 | {Opt_err, NULL}, | 202 | {Opt_err, NULL}, |
199 | }; | 203 | }; |
200 | 204 | ||
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
206 | { | 210 | { |
207 | struct btrfs_fs_info *info = root->fs_info; | 211 | struct btrfs_fs_info *info = root->fs_info; |
208 | substring_t args[MAX_OPT_ARGS]; | 212 | substring_t args[MAX_OPT_ARGS]; |
209 | char *p, *num, *orig; | 213 | char *p, *num, *orig = NULL; |
214 | u64 cache_gen; | ||
210 | int intarg; | 215 | int intarg; |
211 | int ret = 0; | 216 | int ret = 0; |
212 | char *compress_type; | 217 | char *compress_type; |
213 | bool compress_force = false; | 218 | bool compress_force = false; |
214 | 219 | ||
220 | cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); | ||
221 | if (cache_gen) | ||
222 | btrfs_set_opt(info->mount_opt, SPACE_CACHE); | ||
223 | |||
215 | if (!options) | 224 | if (!options) |
216 | return 0; | 225 | goto out; |
217 | 226 | ||
218 | /* | 227 | /* |
219 | * strsep changes the string, duplicate it because parse_options | 228 | * strsep changes the string, duplicate it because parse_options |
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
360 | btrfs_set_opt(info->mount_opt, DISCARD); | 369 | btrfs_set_opt(info->mount_opt, DISCARD); |
361 | break; | 370 | break; |
362 | case Opt_space_cache: | 371 | case Opt_space_cache: |
363 | printk(KERN_INFO "btrfs: enabling disk space caching\n"); | ||
364 | btrfs_set_opt(info->mount_opt, SPACE_CACHE); | 372 | btrfs_set_opt(info->mount_opt, SPACE_CACHE); |
365 | break; | 373 | break; |
374 | case Opt_no_space_cache: | ||
375 | printk(KERN_INFO "btrfs: disabling disk space caching\n"); | ||
376 | btrfs_clear_opt(info->mount_opt, SPACE_CACHE); | ||
377 | break; | ||
366 | case Opt_inode_cache: | 378 | case Opt_inode_cache: |
367 | printk(KERN_INFO "btrfs: enabling inode map caching\n"); | 379 | printk(KERN_INFO "btrfs: enabling inode map caching\n"); |
368 | btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); | 380 | btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); |
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
381 | printk(KERN_INFO "btrfs: enabling auto defrag"); | 393 | printk(KERN_INFO "btrfs: enabling auto defrag"); |
382 | btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); | 394 | btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); |
383 | break; | 395 | break; |
396 | case Opt_recovery: | ||
397 | printk(KERN_INFO "btrfs: enabling auto recovery"); | ||
398 | btrfs_set_opt(info->mount_opt, RECOVERY); | ||
399 | break; | ||
384 | case Opt_err: | 400 | case Opt_err: |
385 | printk(KERN_INFO "btrfs: unrecognized mount option " | 401 | printk(KERN_INFO "btrfs: unrecognized mount option " |
386 | "'%s'\n", p); | 402 | "'%s'\n", p); |
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
391 | } | 407 | } |
392 | } | 408 | } |
393 | out: | 409 | out: |
410 | if (!ret && btrfs_test_opt(root, SPACE_CACHE)) | ||
411 | printk(KERN_INFO "btrfs: disk space caching is enabled\n"); | ||
394 | kfree(orig); | 412 | kfree(orig); |
395 | return ret; | 413 | return ret; |
396 | } | 414 | } |
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, | |||
406 | u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) | 424 | u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) |
407 | { | 425 | { |
408 | substring_t args[MAX_OPT_ARGS]; | 426 | substring_t args[MAX_OPT_ARGS]; |
409 | char *opts, *orig, *p; | 427 | char *device_name, *opts, *orig, *p; |
410 | int error = 0; | 428 | int error = 0; |
411 | int intarg; | 429 | int intarg; |
412 | 430 | ||
413 | if (!options) | 431 | if (!options) |
414 | goto out; | 432 | return 0; |
415 | 433 | ||
416 | /* | 434 | /* |
417 | * strsep changes the string, duplicate it because parse_options | 435 | * strsep changes the string, duplicate it because parse_options |
@@ -430,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, | |||
430 | token = match_token(p, tokens, args); | 448 | token = match_token(p, tokens, args); |
431 | switch (token) { | 449 | switch (token) { |
432 | case Opt_subvol: | 450 | case Opt_subvol: |
451 | kfree(*subvol_name); | ||
433 | *subvol_name = match_strdup(&args[0]); | 452 | *subvol_name = match_strdup(&args[0]); |
434 | break; | 453 | break; |
435 | case Opt_subvolid: | 454 | case Opt_subvolid: |
@@ -457,29 +476,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, | |||
457 | } | 476 | } |
458 | break; | 477 | break; |
459 | case Opt_device: | 478 | case Opt_device: |
460 | error = btrfs_scan_one_device(match_strdup(&args[0]), | 479 | device_name = match_strdup(&args[0]); |
480 | if (!device_name) { | ||
481 | error = -ENOMEM; | ||
482 | goto out; | ||
483 | } | ||
484 | error = btrfs_scan_one_device(device_name, | ||
461 | flags, holder, fs_devices); | 485 | flags, holder, fs_devices); |
486 | kfree(device_name); | ||
462 | if (error) | 487 | if (error) |
463 | goto out_free_opts; | 488 | goto out; |
464 | break; | 489 | break; |
465 | default: | 490 | default: |
466 | break; | 491 | break; |
467 | } | 492 | } |
468 | } | 493 | } |
469 | 494 | ||
470 | out_free_opts: | 495 | out: |
471 | kfree(orig); | 496 | kfree(orig); |
472 | out: | ||
473 | /* | ||
474 | * If no subvolume name is specified we use the default one. Allocate | ||
475 | * a copy of the string "." here so that code later in the | ||
476 | * mount path doesn't care if it's the default volume or another one. | ||
477 | */ | ||
478 | if (!*subvol_name) { | ||
479 | *subvol_name = kstrdup(".", GFP_KERNEL); | ||
480 | if (!*subvol_name) | ||
481 | return -ENOMEM; | ||
482 | } | ||
483 | return error; | 497 | return error; |
484 | } | 498 | } |
485 | 499 | ||
@@ -492,7 +506,6 @@ static struct dentry *get_default_root(struct super_block *sb, | |||
492 | struct btrfs_path *path; | 506 | struct btrfs_path *path; |
493 | struct btrfs_key location; | 507 | struct btrfs_key location; |
494 | struct inode *inode; | 508 | struct inode *inode; |
495 | struct dentry *dentry; | ||
496 | u64 dir_id; | 509 | u64 dir_id; |
497 | int new = 0; | 510 | int new = 0; |
498 | 511 | ||
@@ -517,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb, | |||
517 | * will mount by default if we haven't been given a specific subvolume | 530 | * will mount by default if we haven't been given a specific subvolume |
518 | * to mount. | 531 | * to mount. |
519 | */ | 532 | */ |
520 | dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); | 533 | dir_id = btrfs_super_root_dir(root->fs_info->super_copy); |
521 | di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); | 534 | di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); |
522 | if (IS_ERR(di)) { | 535 | if (IS_ERR(di)) { |
523 | btrfs_free_path(path); | 536 | btrfs_free_path(path); |
@@ -566,29 +579,7 @@ setup_root: | |||
566 | return dget(sb->s_root); | 579 | return dget(sb->s_root); |
567 | } | 580 | } |
568 | 581 | ||
569 | if (new) { | 582 | return d_obtain_alias(inode); |
570 | const struct qstr name = { .name = "/", .len = 1 }; | ||
571 | |||
572 | /* | ||
573 | * New inode, we need to make the dentry a sibling of s_root so | ||
574 | * everything gets cleaned up properly on unmount. | ||
575 | */ | ||
576 | dentry = d_alloc(sb->s_root, &name); | ||
577 | if (!dentry) { | ||
578 | iput(inode); | ||
579 | return ERR_PTR(-ENOMEM); | ||
580 | } | ||
581 | d_splice_alias(inode, dentry); | ||
582 | } else { | ||
583 | /* | ||
584 | * We found the inode in cache, just find a dentry for it and | ||
585 | * put the reference to the inode we just got. | ||
586 | */ | ||
587 | dentry = d_find_alias(inode); | ||
588 | iput(inode); | ||
589 | } | ||
590 | |||
591 | return dentry; | ||
592 | } | 583 | } |
593 | 584 | ||
594 | static int btrfs_fill_super(struct super_block *sb, | 585 | static int btrfs_fill_super(struct super_block *sb, |
@@ -719,6 +710,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
719 | seq_puts(seq, ",noacl"); | 710 | seq_puts(seq, ",noacl"); |
720 | if (btrfs_test_opt(root, SPACE_CACHE)) | 711 | if (btrfs_test_opt(root, SPACE_CACHE)) |
721 | seq_puts(seq, ",space_cache"); | 712 | seq_puts(seq, ",space_cache"); |
713 | else | ||
714 | seq_puts(seq, ",nospace_cache"); | ||
722 | if (btrfs_test_opt(root, CLEAR_CACHE)) | 715 | if (btrfs_test_opt(root, CLEAR_CACHE)) |
723 | seq_puts(seq, ",clear_cache"); | 716 | seq_puts(seq, ",clear_cache"); |
724 | if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) | 717 | if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) |
@@ -753,6 +746,111 @@ static int btrfs_set_super(struct super_block *s, void *data) | |||
753 | return set_anon_super(s, data); | 746 | return set_anon_super(s, data); |
754 | } | 747 | } |
755 | 748 | ||
749 | /* | ||
750 | * subvolumes are identified by ino 256 | ||
751 | */ | ||
752 | static inline int is_subvolume_inode(struct inode *inode) | ||
753 | { | ||
754 | if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) | ||
755 | return 1; | ||
756 | return 0; | ||
757 | } | ||
758 | |||
759 | /* | ||
760 | * This will strip out the subvol=%s argument for an argument string and add | ||
761 | * subvolid=0 to make sure we get the actual tree root for path walking to the | ||
762 | * subvol we want. | ||
763 | */ | ||
764 | static char *setup_root_args(char *args) | ||
765 | { | ||
766 | unsigned copied = 0; | ||
767 | unsigned len = strlen(args) + 2; | ||
768 | char *pos; | ||
769 | char *ret; | ||
770 | |||
771 | /* | ||
772 | * We need the same args as before, but minus | ||
773 | * | ||
774 | * subvol=a | ||
775 | * | ||
776 | * and add | ||
777 | * | ||
778 | * subvolid=0 | ||
779 | * | ||
780 | * which is a difference of 2 characters, so we allocate strlen(args) + | ||
781 | * 2 characters. | ||
782 | */ | ||
783 | ret = kzalloc(len * sizeof(char), GFP_NOFS); | ||
784 | if (!ret) | ||
785 | return NULL; | ||
786 | pos = strstr(args, "subvol="); | ||
787 | |||
788 | /* This shouldn't happen, but just in case... */ | ||
789 | if (!pos) { | ||
790 | kfree(ret); | ||
791 | return NULL; | ||
792 | } | ||
793 | |||
794 | /* | ||
795 | * The subvol=<> arg is not at the front of the string; copy everything | ||
796 | * up to it into ret. | ||
797 | */ | ||
798 | if (pos != args) { | ||
799 | *pos = '\0'; | ||
800 | strcpy(ret, args); | ||
801 | copied += strlen(args); | ||
802 | pos++; | ||
803 | } | ||
804 | |||
805 | strncpy(ret + copied, "subvolid=0", len - copied); | ||
806 | |||
807 | /* Length of subvolid=0 */ | ||
808 | copied += 10; | ||
809 | |||
810 | /* | ||
811 | * If there is no ',' after the subvol= option then we know there are no | ||
812 | * other options and we can just return. | ||
813 | */ | ||
814 | pos = strchr(pos, ','); | ||
815 | if (!pos) | ||
816 | return ret; | ||
817 | |||
818 | /* Copy the rest of the arguments into our buffer */ | ||
819 | strncpy(ret + copied, pos, len - copied); | ||
820 | copied += strlen(pos); | ||
821 | |||
822 | return ret; | ||
823 | } | ||
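
Worked examples for setup_root_args(), tracing its three paths with hypothetical mount strings:

	/* "subvol=snap1"               -> "subvolid=0"
	 *     (pos == args, nothing before it, no comma after it)
	 * "compress=lzo,subvol=snap1"  -> "compress=lzo,subvolid=0"
	 *     (prefix up to subvol= copied first, then subvolid=0 appended)
	 * "subvol=snap1,compress=lzo"  -> "subvolid=0,compress=lzo"
	 *     (subvolid=0 first, then the remainder from the comma on)
	 */
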
824 | |||
825 | static struct dentry *mount_subvol(const char *subvol_name, int flags, | ||
826 | const char *device_name, char *data) | ||
827 | { | ||
828 | struct dentry *root; | ||
829 | struct vfsmount *mnt; | ||
830 | char *newargs; | ||
831 | |||
832 | newargs = setup_root_args(data); | ||
833 | if (!newargs) | ||
834 | return ERR_PTR(-ENOMEM); | ||
835 | mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, | ||
836 | newargs); | ||
837 | kfree(newargs); | ||
838 | if (IS_ERR(mnt)) | ||
839 | return ERR_CAST(mnt); | ||
840 | |||
841 | root = mount_subtree(mnt, subvol_name); | ||
842 | |||
843 | if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { | ||
844 | struct super_block *s = root->d_sb; | ||
845 | dput(root); | ||
846 | root = ERR_PTR(-EINVAL); | ||
847 | deactivate_locked_super(s); | ||
848 | printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", | ||
849 | subvol_name); | ||
850 | } | ||
851 | |||
852 | return root; | ||
853 | } | ||
756 | 854 | ||
757 | /* | 855 | /* |
758 | * Find a superblock for the given device / mount point. | 856 | * Find a superblock for the given device / mount point. |
@@ -767,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, | |||
767 | struct super_block *s; | 865 | struct super_block *s; |
768 | struct dentry *root; | 866 | struct dentry *root; |
769 | struct btrfs_fs_devices *fs_devices = NULL; | 867 | struct btrfs_fs_devices *fs_devices = NULL; |
770 | struct btrfs_root *tree_root = NULL; | ||
771 | struct btrfs_fs_info *fs_info = NULL; | 868 | struct btrfs_fs_info *fs_info = NULL; |
772 | fmode_t mode = FMODE_READ; | 869 | fmode_t mode = FMODE_READ; |
773 | char *subvol_name = NULL; | 870 | char *subvol_name = NULL; |
@@ -781,21 +878,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, | |||
781 | error = btrfs_parse_early_options(data, mode, fs_type, | 878 | error = btrfs_parse_early_options(data, mode, fs_type, |
782 | &subvol_name, &subvol_objectid, | 879 | &subvol_name, &subvol_objectid, |
783 | &subvol_rootid, &fs_devices); | 880 | &subvol_rootid, &fs_devices); |
784 | if (error) | 881 | if (error) { |
882 | kfree(subvol_name); | ||
785 | return ERR_PTR(error); | 883 | return ERR_PTR(error); |
884 | } | ||
786 | 885 | ||
787 | error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); | 886 | if (subvol_name) { |
788 | if (error) | 887 | root = mount_subvol(subvol_name, flags, device_name, data); |
789 | goto error_free_subvol_name; | 888 | kfree(subvol_name); |
889 | return root; | ||
890 | } | ||
790 | 891 | ||
791 | error = btrfs_open_devices(fs_devices, mode, fs_type); | 892 | error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); |
792 | if (error) | 893 | if (error) |
793 | goto error_free_subvol_name; | 894 | return ERR_PTR(error); |
794 | |||
795 | if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { | ||
796 | error = -EACCES; | ||
797 | goto error_close_devices; | ||
798 | } | ||
799 | 895 | ||
800 | /* | 896 | /* |
801 | * Setup a dummy root and fs_info for test/set super. This is because | 897 | * Setup a dummy root and fs_info for test/set super. This is because |
@@ -804,19 +900,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, | |||
804 | * then open_ctree will properly initialize everything later. | 900 | * then open_ctree will properly initialize everything later. |
805 | */ | 901 | */ |
806 | fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); | 902 | fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); |
807 | tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | 903 | if (!fs_info) |
808 | if (!fs_info || !tree_root) { | 904 | return ERR_PTR(-ENOMEM); |
905 | |||
906 | fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); | ||
907 | if (!fs_info->tree_root) { | ||
809 | error = -ENOMEM; | 908 | error = -ENOMEM; |
810 | goto error_close_devices; | 909 | goto error_fs_info; |
811 | } | 910 | } |
812 | fs_info->tree_root = tree_root; | 911 | fs_info->tree_root->fs_info = fs_info; |
813 | fs_info->fs_devices = fs_devices; | 912 | fs_info->fs_devices = fs_devices; |
814 | tree_root->fs_info = fs_info; | 913 | |
914 | fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); | ||
915 | fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); | ||
916 | if (!fs_info->super_copy || !fs_info->super_for_commit) { | ||
917 | error = -ENOMEM; | ||
918 | goto error_fs_info; | ||
919 | } | ||
920 | |||
921 | error = btrfs_open_devices(fs_devices, mode, fs_type); | ||
922 | if (error) | ||
923 | goto error_fs_info; | ||
924 | |||
925 | if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { | ||
926 | error = -EACCES; | ||
927 | goto error_close_devices; | ||
928 | } | ||
815 | 929 | ||
816 | bdev = fs_devices->latest_bdev; | 930 | bdev = fs_devices->latest_bdev; |
817 | s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); | 931 | s = sget(fs_type, btrfs_test_super, btrfs_set_super, |
818 | if (IS_ERR(s)) | 932 | fs_info->tree_root); |
819 | goto error_s; | 933 | if (IS_ERR(s)) { |
934 | error = PTR_ERR(s); | ||
935 | goto error_close_devices; | ||
936 | } | ||
820 | 937 | ||
821 | if (s->s_root) { | 938 | if (s->s_root) { |
822 | if ((flags ^ s->s_flags) & MS_RDONLY) { | 939 | if ((flags ^ s->s_flags) & MS_RDONLY) { |
@@ -826,75 +943,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, | |||
826 | } | 943 | } |
827 | 944 | ||
828 | btrfs_close_devices(fs_devices); | 945 | btrfs_close_devices(fs_devices); |
829 | kfree(fs_info); | 946 | free_fs_info(fs_info); |
830 | kfree(tree_root); | ||
831 | } else { | 947 | } else { |
832 | char b[BDEVNAME_SIZE]; | 948 | char b[BDEVNAME_SIZE]; |
833 | 949 | ||
834 | s->s_flags = flags | MS_NOSEC; | 950 | s->s_flags = flags | MS_NOSEC; |
835 | strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); | 951 | strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); |
952 | btrfs_sb(s)->fs_info->bdev_holder = fs_type; | ||
836 | error = btrfs_fill_super(s, fs_devices, data, | 953 | error = btrfs_fill_super(s, fs_devices, data, |
837 | flags & MS_SILENT ? 1 : 0); | 954 | flags & MS_SILENT ? 1 : 0); |
838 | if (error) { | 955 | if (error) { |
839 | deactivate_locked_super(s); | 956 | deactivate_locked_super(s); |
840 | goto error_free_subvol_name; | 957 | return ERR_PTR(error); |
841 | } | 958 | } |
842 | 959 | ||
843 | btrfs_sb(s)->fs_info->bdev_holder = fs_type; | ||
844 | s->s_flags |= MS_ACTIVE; | 960 | s->s_flags |= MS_ACTIVE; |
845 | } | 961 | } |
846 | 962 | ||
847 | /* if they gave us a subvolume name bind mount into that */ | 963 | root = get_default_root(s, subvol_objectid); |
848 | if (strcmp(subvol_name, ".")) { | 964 | if (IS_ERR(root)) { |
849 | struct dentry *new_root; | 965 | deactivate_locked_super(s); |
850 | 966 | return root; | |
851 | root = get_default_root(s, subvol_rootid); | ||
852 | if (IS_ERR(root)) { | ||
853 | error = PTR_ERR(root); | ||
854 | deactivate_locked_super(s); | ||
855 | goto error_free_subvol_name; | ||
856 | } | ||
857 | |||
858 | mutex_lock(&root->d_inode->i_mutex); | ||
859 | new_root = lookup_one_len(subvol_name, root, | ||
860 | strlen(subvol_name)); | ||
861 | mutex_unlock(&root->d_inode->i_mutex); | ||
862 | |||
863 | if (IS_ERR(new_root)) { | ||
864 | dput(root); | ||
865 | deactivate_locked_super(s); | ||
866 | error = PTR_ERR(new_root); | ||
867 | goto error_free_subvol_name; | ||
868 | } | ||
869 | if (!new_root->d_inode) { | ||
870 | dput(root); | ||
871 | dput(new_root); | ||
872 | deactivate_locked_super(s); | ||
873 | error = -ENXIO; | ||
874 | goto error_free_subvol_name; | ||
875 | } | ||
876 | dput(root); | ||
877 | root = new_root; | ||
878 | } else { | ||
879 | root = get_default_root(s, subvol_objectid); | ||
880 | if (IS_ERR(root)) { | ||
881 | error = PTR_ERR(root); | ||
882 | deactivate_locked_super(s); | ||
883 | goto error_free_subvol_name; | ||
884 | } | ||
885 | } | 967 | } |
886 | 968 | ||
887 | kfree(subvol_name); | ||
888 | return root; | 969 | return root; |
889 | 970 | ||
890 | error_s: | ||
891 | error = PTR_ERR(s); | ||
892 | error_close_devices: | 971 | error_close_devices: |
893 | btrfs_close_devices(fs_devices); | 972 | btrfs_close_devices(fs_devices); |
894 | kfree(fs_info); | 973 | error_fs_info: |
895 | kfree(tree_root); | 974 | free_fs_info(fs_info); |
896 | error_free_subvol_name: | ||
897 | kfree(subvol_name); | ||
898 | return ERR_PTR(error); | 975 | return ERR_PTR(error); |
899 | } | 976 | } |
900 | 977 | ||
@@ -919,7 +996,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
919 | if (root->fs_info->fs_devices->rw_devices == 0) | 996 | if (root->fs_info->fs_devices->rw_devices == 0) |
920 | return -EACCES; | 997 | return -EACCES; |
921 | 998 | ||
922 | if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) | 999 | if (btrfs_super_log_root(root->fs_info->super_copy) != 0) |
923 | return -EINVAL; | 1000 | return -EINVAL; |
924 | 1001 | ||
925 | ret = btrfs_cleanup_fs_roots(root->fs_info); | 1002 | ret = btrfs_cleanup_fs_roots(root->fs_info); |
@@ -980,7 +1057,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
980 | int i = 0, nr_devices; | 1057 | int i = 0, nr_devices; |
981 | int ret; | 1058 | int ret; |
982 | 1059 | ||
983 | nr_devices = fs_info->fs_devices->rw_devices; | 1060 | nr_devices = fs_info->fs_devices->open_devices; |
984 | BUG_ON(!nr_devices); | 1061 | BUG_ON(!nr_devices); |
985 | 1062 | ||
986 | devices_info = kmalloc(sizeof(*devices_info) * nr_devices, | 1063 | devices_info = kmalloc(sizeof(*devices_info) * nr_devices, |
@@ -1002,8 +1079,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1002 | else | 1079 | else |
1003 | min_stripe_size = BTRFS_STRIPE_LEN; | 1080 | min_stripe_size = BTRFS_STRIPE_LEN; |
1004 | 1081 | ||
1005 | list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { | 1082 | list_for_each_entry(device, &fs_devices->devices, dev_list) { |
1006 | if (!device->in_fs_metadata) | 1083 | if (!device->in_fs_metadata || !device->bdev) |
1007 | continue; | 1084 | continue; |
1008 | 1085 | ||
1009 | avail_space = device->total_bytes - device->bytes_used; | 1086 | avail_space = device->total_bytes - device->bytes_used; |
@@ -1085,7 +1162,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1085 | static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 1162 | static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
1086 | { | 1163 | { |
1087 | struct btrfs_root *root = btrfs_sb(dentry->d_sb); | 1164 | struct btrfs_root *root = btrfs_sb(dentry->d_sb); |
1088 | struct btrfs_super_block *disk_super = &root->fs_info->super_copy; | 1165 | struct btrfs_super_block *disk_super = root->fs_info->super_copy; |
1089 | struct list_head *head = &root->fs_info->space_info; | 1166 | struct list_head *head = &root->fs_info->space_info; |
1090 | struct btrfs_space_info *found; | 1167 | struct btrfs_space_info *found; |
1091 | u64 total_used = 0; | 1168 | u64 total_used = 0; |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e24b7964a15..81376d94cd3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) | |||
55 | struct btrfs_transaction *cur_trans; | 55 | struct btrfs_transaction *cur_trans; |
56 | 56 | ||
57 | spin_lock(&root->fs_info->trans_lock); | 57 | spin_lock(&root->fs_info->trans_lock); |
58 | loop: | ||
58 | if (root->fs_info->trans_no_join) { | 59 | if (root->fs_info->trans_no_join) { |
59 | if (!nofail) { | 60 | if (!nofail) { |
60 | spin_unlock(&root->fs_info->trans_lock); | 61 | spin_unlock(&root->fs_info->trans_lock); |
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) | |||
75 | cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); | 76 | cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); |
76 | if (!cur_trans) | 77 | if (!cur_trans) |
77 | return -ENOMEM; | 78 | return -ENOMEM; |
79 | |||
78 | spin_lock(&root->fs_info->trans_lock); | 80 | spin_lock(&root->fs_info->trans_lock); |
79 | if (root->fs_info->running_transaction) { | 81 | if (root->fs_info->running_transaction) { |
82 | /* | ||
83 | * someone started a transaction after we unlocked. Make sure | ||
84 | * to redo the trans_no_join checks above | ||
85 | */ | ||
85 | */ | ||
80 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); | 86 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); |
81 | cur_trans = root->fs_info->running_transaction; | 87 | cur_trans = root->fs_info->running_transaction; |
82 | atomic_inc(&cur_trans->use_count); | 88 | goto loop; |
83 | atomic_inc(&cur_trans->num_writers); | ||
84 | cur_trans->num_joined++; | ||
85 | spin_unlock(&root->fs_info->trans_lock); | ||
86 | return 0; | ||
87 | } | 89 | } |
90 | |||
88 | atomic_set(&cur_trans->num_writers, 1); | 91 | atomic_set(&cur_trans->num_writers, 1); |
89 | cur_trans->num_joined = 0; | 92 | cur_trans->num_joined = 0; |
90 | init_waitqueue_head(&cur_trans->writer_wait); | 93 | init_waitqueue_head(&cur_trans->writer_wait); |
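
The new loop gives join_transaction() the classic alloc-outside-lock, recheck-under-lock shape. Condensed to its control flow (a sketch, not the full function):

	spin_lock(&fs_info->trans_lock);
	loop:
		/* check trans_no_join, then running_transaction; join and
		 * return if one already exists, still under the lock */

	spin_unlock(&fs_info->trans_lock);
	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;

	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
		/* lost the race while sleeping in the allocator: free our
		 * transaction and redo every check from loop: */
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		goto loop;
	}
	/* still nothing running: install cur_trans under the lock */
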
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, | |||
275 | */ | 278 | */ |
276 | if (num_items > 0 && root != root->fs_info->chunk_root) { | 279 | if (num_items > 0 && root != root->fs_info->chunk_root) { |
277 | num_bytes = btrfs_calc_trans_metadata_size(root, num_items); | 280 | num_bytes = btrfs_calc_trans_metadata_size(root, num_items); |
278 | ret = btrfs_block_rsv_add(NULL, root, | 281 | ret = btrfs_block_rsv_add(root, |
279 | &root->fs_info->trans_block_rsv, | 282 | &root->fs_info->trans_block_rsv, |
280 | num_bytes); | 283 | num_bytes); |
281 | if (ret) | 284 | if (ret) |
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans, | |||
418 | struct btrfs_root *root) | 421 | struct btrfs_root *root) |
419 | { | 422 | { |
420 | int ret; | 423 | int ret; |
421 | ret = btrfs_block_rsv_check(trans, root, | 424 | |
422 | &root->fs_info->global_block_rsv, 0, 5); | 425 | ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); |
423 | return ret ? 1 : 0; | 426 | return ret ? 1 : 0; |
424 | } | 427 | } |
425 | 428 | ||
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, | |||
427 | struct btrfs_root *root) | 430 | struct btrfs_root *root) |
428 | { | 431 | { |
429 | struct btrfs_transaction *cur_trans = trans->transaction; | 432 | struct btrfs_transaction *cur_trans = trans->transaction; |
433 | struct btrfs_block_rsv *rsv = trans->block_rsv; | ||
430 | int updates; | 434 | int updates; |
431 | 435 | ||
432 | smp_mb(); | 436 | smp_mb(); |
433 | if (cur_trans->blocked || cur_trans->delayed_refs.flushing) | 437 | if (cur_trans->blocked || cur_trans->delayed_refs.flushing) |
434 | return 1; | 438 | return 1; |
435 | 439 | ||
440 | /* | ||
441 | * We need to do this in case we're deleting csums so the global block | ||
442 | * rsv gets used instead of the csum block rsv. | ||
443 | */ | ||
444 | trans->block_rsv = NULL; | ||
445 | |||
436 | updates = trans->delayed_ref_updates; | 446 | updates = trans->delayed_ref_updates; |
437 | trans->delayed_ref_updates = 0; | 447 | trans->delayed_ref_updates = 0; |
438 | if (updates) | 448 | if (updates) |
439 | btrfs_run_delayed_refs(trans, root, updates); | 449 | btrfs_run_delayed_refs(trans, root, updates); |
440 | 450 | ||
451 | trans->block_rsv = rsv; | ||
452 | |||
441 | return should_end_transaction(trans, root); | 453 | return should_end_transaction(trans, root); |
442 | } | 454 | } |
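
The rsv juggling above is a plain save/NULL/restore around the delayed-ref run; isolated as a sketch:

	struct btrfs_block_rsv *rsv = trans->block_rsv;

	/* with block_rsv NULL, csum deletions triggered by the delayed
	 * refs fall back to the global reservation instead of the
	 * (possibly exhausted) csum block rsv */
	trans->block_rsv = NULL;
	btrfs_run_delayed_refs(trans, root, updates);
	trans->block_rsv = rsv;		/* restore the caller's reservation */
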
443 | 455 | ||
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
453 | return 0; | 465 | return 0; |
454 | } | 466 | } |
455 | 467 | ||
468 | btrfs_trans_release_metadata(trans, root); | ||
469 | trans->block_rsv = NULL; | ||
456 | while (count < 4) { | 470 | while (count < 4) { |
457 | unsigned long cur = trans->delayed_ref_updates; | 471 | unsigned long cur = trans->delayed_ref_updates; |
458 | trans->delayed_ref_updates = 0; | 472 | trans->delayed_ref_updates = 0; |
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
473 | count++; | 487 | count++; |
474 | } | 488 | } |
475 | 489 | ||
476 | btrfs_trans_release_metadata(trans, root); | ||
477 | |||
478 | if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && | 490 | if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && |
479 | should_end_transaction(trans, root)) { | 491 | should_end_transaction(trans, root)) { |
480 | trans->transaction->blocked = 1; | 492 | trans->transaction->blocked = 1; |
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans, | |||
562 | int btrfs_write_marked_extents(struct btrfs_root *root, | 574 | int btrfs_write_marked_extents(struct btrfs_root *root, |
563 | struct extent_io_tree *dirty_pages, int mark) | 575 | struct extent_io_tree *dirty_pages, int mark) |
564 | { | 576 | { |
565 | int ret; | ||
566 | int err = 0; | 577 | int err = 0; |
567 | int werr = 0; | 578 | int werr = 0; |
568 | struct page *page; | 579 | struct address_space *mapping = root->fs_info->btree_inode->i_mapping; |
569 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
570 | u64 start = 0; | 580 | u64 start = 0; |
571 | u64 end; | 581 | u64 end; |
572 | unsigned long index; | ||
573 | |||
574 | while (1) { | ||
575 | ret = find_first_extent_bit(dirty_pages, start, &start, &end, | ||
576 | mark); | ||
577 | if (ret) | ||
578 | break; | ||
579 | while (start <= end) { | ||
580 | cond_resched(); | ||
581 | |||
582 | index = start >> PAGE_CACHE_SHIFT; | ||
583 | start = (u64)(index + 1) << PAGE_CACHE_SHIFT; | ||
584 | page = find_get_page(btree_inode->i_mapping, index); | ||
585 | if (!page) | ||
586 | continue; | ||
587 | |||
588 | btree_lock_page_hook(page); | ||
589 | if (!page->mapping) { | ||
590 | unlock_page(page); | ||
591 | page_cache_release(page); | ||
592 | continue; | ||
593 | } | ||
594 | 582 | ||
595 | if (PageWriteback(page)) { | 583 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
596 | if (PageDirty(page)) | 584 | mark)) { |
597 | wait_on_page_writeback(page); | 585 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, |
598 | else { | 586 | GFP_NOFS); |
599 | unlock_page(page); | 587 | err = filemap_fdatawrite_range(mapping, start, end); |
600 | page_cache_release(page); | 588 | if (err) |
601 | continue; | 589 | werr = err; |
602 | } | 590 | cond_resched(); |
603 | } | 591 | start = end + 1; |
604 | err = write_one_page(page, 0); | ||
605 | if (err) | ||
606 | werr = err; | ||
607 | page_cache_release(page); | ||
608 | } | ||
609 | } | 592 | } |
610 | if (err) | 593 | if (err) |
611 | werr = err; | 594 | werr = err; |
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
621 | int btrfs_wait_marked_extents(struct btrfs_root *root, | 604 | int btrfs_wait_marked_extents(struct btrfs_root *root, |
622 | struct extent_io_tree *dirty_pages, int mark) | 605 | struct extent_io_tree *dirty_pages, int mark) |
623 | { | 606 | { |
624 | int ret; | ||
625 | int err = 0; | 607 | int err = 0; |
626 | int werr = 0; | 608 | int werr = 0; |
627 | struct page *page; | 609 | struct address_space *mapping = root->fs_info->btree_inode->i_mapping; |
628 | struct inode *btree_inode = root->fs_info->btree_inode; | ||
629 | u64 start = 0; | 610 | u64 start = 0; |
630 | u64 end; | 611 | u64 end; |
631 | unsigned long index; | ||
632 | 612 | ||
633 | while (1) { | 613 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
634 | ret = find_first_extent_bit(dirty_pages, start, &start, &end, | 614 | EXTENT_NEED_WAIT)) { |
635 | mark); | 615 | clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS); |
636 | if (ret) | 616 | err = filemap_fdatawait_range(mapping, start, end); |
637 | break; | 617 | if (err) |
638 | 618 | werr = err; | |
639 | clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); | 619 | cond_resched(); |
640 | while (start <= end) { | 620 | start = end + 1; |
641 | index = start >> PAGE_CACHE_SHIFT; | ||
642 | start = (u64)(index + 1) << PAGE_CACHE_SHIFT; | ||
643 | page = find_get_page(btree_inode->i_mapping, index); | ||
644 | if (!page) | ||
645 | continue; | ||
646 | if (PageDirty(page)) { | ||
647 | btree_lock_page_hook(page); | ||
648 | wait_on_page_writeback(page); | ||
649 | err = write_one_page(page, 0); | ||
650 | if (err) | ||
651 | werr = err; | ||
652 | } | ||
653 | wait_on_page_writeback(page); | ||
654 | page_cache_release(page); | ||
655 | cond_resched(); | ||
656 | } | ||
657 | } | 621 | } |
658 | if (err) | 622 | if (err) |
659 | werr = err; | 623 | werr = err; |
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | |||
673 | 637 | ||
674 | ret = btrfs_write_marked_extents(root, dirty_pages, mark); | 638 | ret = btrfs_write_marked_extents(root, dirty_pages, mark); |
675 | ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); | 639 | ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); |
676 | return ret || ret2; | 640 | |
641 | if (ret) | ||
642 | return ret; | ||
643 | if (ret2) | ||
644 | return ret2; | ||
645 | return 0; | ||
677 | } | 646 | } |
678 | 647 | ||
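The rewritten btrfs_write_marked_extents()/btrfs_wait_marked_extents() pair above drops the old per-page loops in favor of filemap_fdatawrite_range()/filemap_fdatawait_range(), using EXTENT_NEED_WAIT as a handshake bit: the write pass converts the dirty mark to EXTENT_NEED_WAIT as it kicks off writeback, and the wait pass clears that bit as it waits, so both passes walk exactly the same ranges. A minimal userspace sketch of that handshake, with a plain array standing in for the extent_io_tree (illustrative only, not kernel code):

/* Userspace model of the write/wait split (not kernel code). Ranges
 * carry a DIRTY mark; the write pass swaps DIRTY for NEED_WAIT as it
 * starts writeback, the wait pass clears NEED_WAIT as it waits. */
#include <stdio.h>

enum { EXTENT_DIRTY = 1 << 0, EXTENT_NEED_WAIT = 1 << 1 };

struct range { unsigned long start, end; int bits; };

static void write_pass(struct range *r, int n)
{
    for (int i = 0; i < n; i++) {
        if (!(r[i].bits & EXTENT_DIRTY))
            continue;
        /* plays the role of convert_extent_bit(): DIRTY -> NEED_WAIT */
        r[i].bits = (r[i].bits & ~EXTENT_DIRTY) | EXTENT_NEED_WAIT;
        printf("fdatawrite %lu-%lu\n", r[i].start, r[i].end);
    }
}

static void wait_pass(struct range *r, int n)
{
    for (int i = 0; i < n; i++) {
        if (!(r[i].bits & EXTENT_NEED_WAIT))
            continue;
        r[i].bits &= ~EXTENT_NEED_WAIT;  /* plays the role of clear_extent_bits() */
        printf("fdatawait %lu-%lu\n", r[i].start, r[i].end);
    }
}

int main(void)
{
    struct range r[] = {
        { 0, 4095, EXTENT_DIRTY },
        { 8192, 12287, EXTENT_DIRTY },
    };
    write_pass(r, 2);
    wait_pass(r, 2);
    return 0;
}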
679 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | 648 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, |
@@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, | |||
816 | 785 | ||
817 | btrfs_save_ino_cache(root, trans); | 786 | btrfs_save_ino_cache(root, trans); |
818 | 787 | ||
788 | /* see comments in should_cow_block() */ | ||
789 | root->force_cow = 0; | ||
790 | smp_wmb(); | ||
791 | |||
819 | if (root->commit_root != root->node) { | 792 | if (root->commit_root != root->node) { |
820 | mutex_lock(&root->fs_commit_mutex); | 793 | mutex_lock(&root->fs_commit_mutex); |
821 | switch_commit_root(root); | 794 | switch_commit_root(root); |
@@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
911 | } | 884 | } |
912 | 885 | ||
913 | btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); | 886 | btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); |
914 | btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); | ||
915 | 887 | ||
916 | if (to_reserve > 0) { | 888 | if (to_reserve > 0) { |
917 | ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, | 889 | ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, |
918 | to_reserve); | 890 | to_reserve); |
919 | if (ret) { | 891 | if (ret) { |
920 | pending->error = ret; | 892 | pending->error = ret; |
921 | goto fail; | 893 | goto fail; |
@@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
979 | btrfs_tree_unlock(old); | 951 | btrfs_tree_unlock(old); |
980 | free_extent_buffer(old); | 952 | free_extent_buffer(old); |
981 | 953 | ||
954 | /* see comments in should_cow_block() */ | ||
955 | root->force_cow = 1; | ||
956 | smp_wmb(); | ||
957 | |||
982 | btrfs_set_root_node(new_root_item, tmp); | 958 | btrfs_set_root_node(new_root_item, tmp); |
983 | /* record when the snapshot was created in key.offset */ | 959 | /* record when the snapshot was created in key.offset */ |
984 | key.offset = trans->transid; | 960 | key.offset = trans->transid; |
@@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
1002 | BUG_ON(IS_ERR(pending->snap)); | 978 | BUG_ON(IS_ERR(pending->snap)); |
1003 | 979 | ||
1004 | btrfs_reloc_post_snapshot(trans, pending); | 980 | btrfs_reloc_post_snapshot(trans, pending); |
1005 | btrfs_orphan_post_snapshot(trans, pending); | ||
1006 | fail: | 981 | fail: |
1007 | kfree(new_root_item); | 982 | kfree(new_root_item); |
1008 | trans->block_rsv = rsv; | 983 | trans->block_rsv = rsv; |
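The force_cow flag toggled in the two hunks above (cleared in commit_fs_roots(), set after a snapshot is taken) is published with smp_wmb() so that readers in should_cow_block() observe it with a matching read barrier. A userspace analogue of that publish/observe pairing, using C11 release/acquire as a stand-in for the kernel barriers; the real should_cow_block() checks more than this one flag, so treat this as a simplified model:

/* Userspace analogue of the force_cow publication (C11 ordering
 * standing in for smp_wmb()/smp_rmb(); not the kernel primitives). */
#include <stdatomic.h>
#include <stdbool.h>

struct root_state {
    atomic_int force_cow;   /* toggled around snapshot creation */
    /* ...fields that must be visible before the flag is seen... */
};

static void publish_force_cow(struct root_state *root, int val)
{
    /* release ordering plays the role of smp_wmb(): every store
     * sequenced before this one is visible once the flag is read */
    atomic_store_explicit(&root->force_cow, val, memory_order_release);
}

static bool observe_force_cow(struct root_state *root)
{
    /* acquire ordering plays the role of the reader's smp_rmb() */
    return atomic_load_explicit(&root->force_cow,
                                memory_order_acquire) != 0;
}

int main(void)
{
    struct root_state root = { 0 };
    publish_force_cow(&root, 1);    /* snapshot taken */
    return observe_force_cow(&root) ? 0 : 1;
}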
@@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root) | |||
1032 | struct btrfs_root_item *root_item; | 1007 | struct btrfs_root_item *root_item; |
1033 | struct btrfs_super_block *super; | 1008 | struct btrfs_super_block *super; |
1034 | 1009 | ||
1035 | super = &root->fs_info->super_copy; | 1010 | super = root->fs_info->super_copy; |
1036 | 1011 | ||
1037 | root_item = &root->fs_info->chunk_root->root_item; | 1012 | root_item = &root->fs_info->chunk_root->root_item; |
1038 | super->chunk_root = root_item->bytenr; | 1013 | super->chunk_root = root_item->bytenr; |
@@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root) | |||
1043 | super->root = root_item->bytenr; | 1018 | super->root = root_item->bytenr; |
1044 | super->generation = root_item->generation; | 1019 | super->generation = root_item->generation; |
1045 | super->root_level = root_item->level; | 1020 | super->root_level = root_item->level; |
1046 | if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) | 1021 | if (btrfs_test_opt(root, SPACE_CACHE)) |
1047 | super->cache_generation = root_item->generation; | 1022 | super->cache_generation = root_item->generation; |
1048 | } | 1023 | } |
1049 | 1024 | ||
@@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1168 | 1143 | ||
1169 | btrfs_run_ordered_operations(root, 0); | 1144 | btrfs_run_ordered_operations(root, 0); |
1170 | 1145 | ||
1146 | btrfs_trans_release_metadata(trans, root); | ||
1147 | trans->block_rsv = NULL; | ||
1148 | |||
1171 | /* make a pass through all the delayed refs we have so far | 1149 | /* make a pass through all the delayed refs we have so far |
1172 | * any running procs may add more while we are here | 1150 | * any running procs may add more while we are here |
1173 | */ | 1151 | */ |
1174 | ret = btrfs_run_delayed_refs(trans, root, 0); | 1152 | ret = btrfs_run_delayed_refs(trans, root, 0); |
1175 | BUG_ON(ret); | 1153 | BUG_ON(ret); |
1176 | 1154 | ||
1177 | btrfs_trans_release_metadata(trans, root); | ||
1178 | |||
1179 | cur_trans = trans->transaction; | 1155 | cur_trans = trans->transaction; |
1180 | /* | 1156 | /* |
1181 | * set the flushing flag so procs in this transaction have to | 1157 | * set the flushing flag so procs in this transaction have to |
@@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1341 | update_super_roots(root); | 1317 | update_super_roots(root); |
1342 | 1318 | ||
1343 | if (!root->fs_info->log_root_recovering) { | 1319 | if (!root->fs_info->log_root_recovering) { |
1344 | btrfs_set_super_log_root(&root->fs_info->super_copy, 0); | 1320 | btrfs_set_super_log_root(root->fs_info->super_copy, 0); |
1345 | btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); | 1321 | btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); |
1346 | } | 1322 | } |
1347 | 1323 | ||
1348 | memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, | 1324 | memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, |
1349 | sizeof(root->fs_info->super_copy)); | 1325 | sizeof(*root->fs_info->super_copy)); |
1350 | 1326 | ||
1351 | trans->transaction->blocked = 0; | 1327 | trans->transaction->blocked = 0; |
1352 | spin_lock(&root->fs_info->trans_lock); | 1328 | spin_lock(&root->fs_info->trans_lock); |
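Several hunks in this file follow fs_info->super_copy changing from an embedded struct to a pointer, which is why the memcpy() above now sizes the copy with sizeof(*root->fs_info->super_copy). A self-contained reminder of why the dereference matters once the field is a pointer:

/* Why the memcpy above switched to sizeof(*...): once super_copy is a
 * pointer, sizeof(super_copy) is the pointer size, not the struct. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct super_block_copy { char payload[4096]; };

int main(void)
{
    struct super_block_copy *src = calloc(1, sizeof(*src));
    struct super_block_copy *dst = calloc(1, sizeof(*dst));

    printf("sizeof(src)  = %zu\n", sizeof(src));   /* 8 on LP64: wrong length */
    printf("sizeof(*src) = %zu\n", sizeof(*src));  /* 4096: the full struct */

    memcpy(dst, src, sizeof(*src));                /* correct: copy the struct */
    free(src);
    free(dst);
    return 0;
}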
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0618aa39740..3568374d419 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log, | |||
276 | struct walk_control *wc, u64 gen) | 276 | struct walk_control *wc, u64 gen) |
277 | { | 277 | { |
278 | if (wc->pin) | 278 | if (wc->pin) |
279 | btrfs_pin_extent(log->fs_info->extent_root, | 279 | btrfs_pin_extent_for_log_replay(wc->trans, |
280 | eb->start, eb->len, 0); | 280 | log->fs_info->extent_root, |
281 | eb->start, eb->len); | ||
281 | 282 | ||
282 | if (btrfs_buffer_uptodate(eb, gen)) { | 283 | if (btrfs_buffer_uptodate(eb, gen)) { |
283 | if (wc->write) | 284 | if (wc->write) |
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, | |||
1760 | 1761 | ||
1761 | WARN_ON(root_owner != | 1762 | WARN_ON(root_owner != |
1762 | BTRFS_TREE_LOG_OBJECTID); | 1763 | BTRFS_TREE_LOG_OBJECTID); |
1763 | ret = btrfs_free_reserved_extent(root, | 1764 | ret = btrfs_free_and_pin_reserved_extent(root, |
1764 | bytenr, blocksize); | 1765 | bytenr, blocksize); |
1765 | BUG_ON(ret); | 1766 | BUG_ON(ret); |
1766 | } | 1767 | } |
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, | |||
1828 | btrfs_tree_unlock(next); | 1829 | btrfs_tree_unlock(next); |
1829 | 1830 | ||
1830 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); | 1831 | WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); |
1831 | ret = btrfs_free_reserved_extent(root, | 1832 | ret = btrfs_free_and_pin_reserved_extent(root, |
1832 | path->nodes[*level]->start, | 1833 | path->nodes[*level]->start, |
1833 | path->nodes[*level]->len); | 1834 | path->nodes[*level]->len); |
1834 | BUG_ON(ret); | 1835 | BUG_ON(ret); |
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, | |||
1897 | 1898 | ||
1898 | WARN_ON(log->root_key.objectid != | 1899 | WARN_ON(log->root_key.objectid != |
1899 | BTRFS_TREE_LOG_OBJECTID); | 1900 | BTRFS_TREE_LOG_OBJECTID); |
1900 | ret = btrfs_free_reserved_extent(log, next->start, | 1901 | ret = btrfs_free_and_pin_reserved_extent(log, next->start, |
1901 | next->len); | 1902 | next->len); |
1902 | BUG_ON(ret); | 1903 | BUG_ON(ret); |
1903 | } | 1904 | } |
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2013 | /* wait for previous tree log sync to complete */ | 2014 | /* wait for previous tree log sync to complete */ |
2014 | if (atomic_read(&root->log_commit[(index1 + 1) % 2])) | 2015 | if (atomic_read(&root->log_commit[(index1 + 1) % 2])) |
2015 | wait_log_commit(trans, root, root->log_transid - 1); | 2016 | wait_log_commit(trans, root, root->log_transid - 1); |
2016 | |||
2017 | while (1) { | 2017 | while (1) { |
2018 | unsigned long batch = root->log_batch; | 2018 | unsigned long batch = root->log_batch; |
2019 | if (root->log_multiple_pids) { | 2019 | /* when we're on an ssd, just kick the log commit out */ |
2020 | if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) { | ||
2020 | mutex_unlock(&root->log_mutex); | 2021 | mutex_unlock(&root->log_mutex); |
2021 | schedule_timeout_uninterruptible(1); | 2022 | schedule_timeout_uninterruptible(1); |
2022 | mutex_lock(&root->log_mutex); | 2023 | mutex_lock(&root->log_mutex); |
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2117 | BUG_ON(ret); | 2118 | BUG_ON(ret); |
2118 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2119 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
2119 | 2120 | ||
2120 | btrfs_set_super_log_root(&root->fs_info->super_for_commit, | 2121 | btrfs_set_super_log_root(root->fs_info->super_for_commit, |
2121 | log_root_tree->node->start); | 2122 | log_root_tree->node->start); |
2122 | btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, | 2123 | btrfs_set_super_log_root_level(root->fs_info->super_for_commit, |
2123 | btrfs_header_level(log_root_tree->node)); | 2124 | btrfs_header_level(log_root_tree->node)); |
2124 | 2125 | ||
2125 | log_root_tree->log_batch = 0; | 2126 | log_root_tree->log_batch = 0; |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f2a4cc79da6..0a8c8f8304b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path, | |||
366 | } | 366 | } |
367 | INIT_LIST_HEAD(&device->dev_alloc_list); | 367 | INIT_LIST_HEAD(&device->dev_alloc_list); |
368 | 368 | ||
369 | /* init readahead state */ | ||
370 | spin_lock_init(&device->reada_lock); | ||
371 | device->reada_curr_zone = NULL; | ||
372 | atomic_set(&device->reada_in_flight, 0); | ||
373 | device->reada_next = 0; | ||
374 | INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT); | ||
375 | INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT); | ||
376 | |||
369 | mutex_lock(&fs_devices->device_list_mutex); | 377 | mutex_lock(&fs_devices->device_list_mutex); |
370 | list_add_rcu(&device->dev_list, &fs_devices->devices); | 378 | list_add_rcu(&device->dev_list, &fs_devices->devices); |
371 | mutex_unlock(&fs_devices->device_list_mutex); | 379 | mutex_unlock(&fs_devices->device_list_mutex); |
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
597 | set_blocksize(bdev, 4096); | 605 | set_blocksize(bdev, 4096); |
598 | 606 | ||
599 | bh = btrfs_read_dev_super(bdev); | 607 | bh = btrfs_read_dev_super(bdev); |
600 | if (!bh) { | 608 | if (!bh) |
601 | ret = -EINVAL; | ||
602 | goto error_close; | 609 | goto error_close; |
603 | } | ||
604 | 610 | ||
605 | disk_super = (struct btrfs_super_block *)bh->b_data; | 611 | disk_super = (struct btrfs_super_block *)bh->b_data; |
606 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 612 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
@@ -655,7 +661,7 @@ error: | |||
655 | continue; | 661 | continue; |
656 | } | 662 | } |
657 | if (fs_devices->open_devices == 0) { | 663 | if (fs_devices->open_devices == 0) { |
658 | ret = -EIO; | 664 | ret = -EINVAL; |
659 | goto out; | 665 | goto out; |
660 | } | 666 | } |
661 | fs_devices->seeding = seeding; | 667 | fs_devices->seeding = seeding; |
@@ -993,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, | |||
993 | key.objectid = device->devid; | 999 | key.objectid = device->devid; |
994 | key.offset = start; | 1000 | key.offset = start; |
995 | key.type = BTRFS_DEV_EXTENT_KEY; | 1001 | key.type = BTRFS_DEV_EXTENT_KEY; |
996 | 1002 | again: | |
997 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | 1003 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); |
998 | if (ret > 0) { | 1004 | if (ret > 0) { |
999 | ret = btrfs_previous_item(root, path, key.objectid, | 1005 | ret = btrfs_previous_item(root, path, key.objectid, |
@@ -1006,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, | |||
1006 | struct btrfs_dev_extent); | 1012 | struct btrfs_dev_extent); |
1007 | BUG_ON(found_key.offset > start || found_key.offset + | 1013 | BUG_ON(found_key.offset > start || found_key.offset + |
1008 | btrfs_dev_extent_length(leaf, extent) < start); | 1014 | btrfs_dev_extent_length(leaf, extent) < start); |
1015 | key = found_key; | ||
1016 | btrfs_release_path(path); | ||
1017 | goto again; | ||
1009 | } else if (ret == 0) { | 1018 | } else if (ret == 0) { |
1010 | leaf = path->nodes[0]; | 1019 | leaf = path->nodes[0]; |
1011 | extent = btrfs_item_ptr(leaf, path->slots[0], | 1020 | extent = btrfs_item_ptr(leaf, path->slots[0], |
@@ -1013,8 +1022,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, | |||
1013 | } | 1022 | } |
1014 | BUG_ON(ret); | 1023 | BUG_ON(ret); |
1015 | 1024 | ||
1016 | if (device->bytes_used > 0) | 1025 | if (device->bytes_used > 0) { |
1017 | device->bytes_used -= btrfs_dev_extent_length(leaf, extent); | 1026 | u64 len = btrfs_dev_extent_length(leaf, extent); |
1027 | device->bytes_used -= len; | ||
1028 | spin_lock(&root->fs_info->free_chunk_lock); | ||
1029 | root->fs_info->free_chunk_space += len; | ||
1030 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
1031 | } | ||
1018 | ret = btrfs_del_item(trans, root, path); | 1032 | ret = btrfs_del_item(trans, root, path); |
1019 | 1033 | ||
1020 | out: | 1034 | out: |
@@ -1356,6 +1370,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1356 | if (ret) | 1370 | if (ret) |
1357 | goto error_undo; | 1371 | goto error_undo; |
1358 | 1372 | ||
1373 | spin_lock(&root->fs_info->free_chunk_lock); | ||
1374 | root->fs_info->free_chunk_space -= device->total_bytes - | ||
1375 | device->bytes_used; | ||
1376 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
1377 | |||
1359 | device->in_fs_metadata = 0; | 1378 | device->in_fs_metadata = 0; |
1360 | btrfs_scrub_cancel_dev(root, device); | 1379 | btrfs_scrub_cancel_dev(root, device); |
1361 | 1380 | ||
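The free_chunk_space updates sprinkled through this file all maintain one invariant: the counter tracks unallocated bytes across writeable devices (credited on add/grow, debited on removal, shrink, and chunk allocation) and is only ever touched under free_chunk_lock. A compact userspace model of that bookkeeping, with a pthread mutex standing in for the spinlock and invented numbers:

/* Userspace model of the free_chunk_space bookkeeping added in this
 * series: one counter of unallocated device bytes, only ever updated
 * under one lock (not the kernel code). */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t free_chunk_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t free_chunk_space;

static void device_added(uint64_t total_bytes, uint64_t bytes_used)
{
    pthread_mutex_lock(&free_chunk_lock);
    free_chunk_space += total_bytes - bytes_used;   /* credit free room */
    pthread_mutex_unlock(&free_chunk_lock);
}

static void chunk_allocated(uint64_t stripe_size, int num_stripes)
{
    pthread_mutex_lock(&free_chunk_lock);
    free_chunk_space -= stripe_size * (uint64_t)num_stripes;
    pthread_mutex_unlock(&free_chunk_lock);
}

int main(void)
{
    device_added(1ULL << 30, 0);        /* 1 GiB device, empty */
    chunk_allocated(256ULL << 20, 2);   /* 256 MiB chunk, 2 stripes */
    printf("free: %llu bytes\n", (unsigned long long)free_chunk_space);
    return 0;
}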
@@ -1387,8 +1406,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1387 | call_rcu(&device->rcu, free_device); | 1406 | call_rcu(&device->rcu, free_device); |
1388 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | 1407 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); |
1389 | 1408 | ||
1390 | num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; | 1409 | num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; |
1391 | btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); | 1410 | btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); |
1392 | 1411 | ||
1393 | if (cur_devices->open_devices == 0) { | 1412 | if (cur_devices->open_devices == 0) { |
1394 | struct btrfs_fs_devices *fs_devices; | 1413 | struct btrfs_fs_devices *fs_devices; |
@@ -1450,7 +1469,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, | |||
1450 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; | 1469 | struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; |
1451 | struct btrfs_fs_devices *old_devices; | 1470 | struct btrfs_fs_devices *old_devices; |
1452 | struct btrfs_fs_devices *seed_devices; | 1471 | struct btrfs_fs_devices *seed_devices; |
1453 | struct btrfs_super_block *disk_super = &root->fs_info->super_copy; | 1472 | struct btrfs_super_block *disk_super = root->fs_info->super_copy; |
1454 | struct btrfs_device *device; | 1473 | struct btrfs_device *device; |
1455 | u64 super_flags; | 1474 | u64 super_flags; |
1456 | 1475 | ||
@@ -1592,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1592 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) | 1611 | if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) |
1593 | return -EINVAL; | 1612 | return -EINVAL; |
1594 | 1613 | ||
1595 | bdev = blkdev_get_by_path(device_path, FMODE_EXCL, | 1614 | bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, |
1596 | root->fs_info->bdev_holder); | 1615 | root->fs_info->bdev_holder); |
1597 | if (IS_ERR(bdev)) | 1616 | if (IS_ERR(bdev)) |
1598 | return PTR_ERR(bdev); | 1617 | return PTR_ERR(bdev); |
@@ -1691,15 +1710,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) | |||
1691 | root->fs_info->fs_devices->num_can_discard++; | 1710 | root->fs_info->fs_devices->num_can_discard++; |
1692 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; | 1711 | root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; |
1693 | 1712 | ||
1713 | spin_lock(&root->fs_info->free_chunk_lock); | ||
1714 | root->fs_info->free_chunk_space += device->total_bytes; | ||
1715 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
1716 | |||
1694 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) | 1717 | if (!blk_queue_nonrot(bdev_get_queue(bdev))) |
1695 | root->fs_info->fs_devices->rotating = 1; | 1718 | root->fs_info->fs_devices->rotating = 1; |
1696 | 1719 | ||
1697 | total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); | 1720 | total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); |
1698 | btrfs_set_super_total_bytes(&root->fs_info->super_copy, | 1721 | btrfs_set_super_total_bytes(root->fs_info->super_copy, |
1699 | total_bytes + device->total_bytes); | 1722 | total_bytes + device->total_bytes); |
1700 | 1723 | ||
1701 | total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); | 1724 | total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); |
1702 | btrfs_set_super_num_devices(&root->fs_info->super_copy, | 1725 | btrfs_set_super_num_devices(root->fs_info->super_copy, |
1703 | total_bytes + 1); | 1726 | total_bytes + 1); |
1704 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); | 1727 | mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); |
1705 | 1728 | ||
@@ -1790,7 +1813,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, | |||
1790 | struct btrfs_device *device, u64 new_size) | 1813 | struct btrfs_device *device, u64 new_size) |
1791 | { | 1814 | { |
1792 | struct btrfs_super_block *super_copy = | 1815 | struct btrfs_super_block *super_copy = |
1793 | &device->dev_root->fs_info->super_copy; | 1816 | device->dev_root->fs_info->super_copy; |
1794 | u64 old_total = btrfs_super_total_bytes(super_copy); | 1817 | u64 old_total = btrfs_super_total_bytes(super_copy); |
1795 | u64 diff = new_size - device->total_bytes; | 1818 | u64 diff = new_size - device->total_bytes; |
1796 | 1819 | ||
@@ -1849,7 +1872,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, | |||
1849 | static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 | 1872 | static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 |
1850 | chunk_offset) | 1873 | chunk_offset) |
1851 | { | 1874 | { |
1852 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | 1875 | struct btrfs_super_block *super_copy = root->fs_info->super_copy; |
1853 | struct btrfs_disk_key *disk_key; | 1876 | struct btrfs_disk_key *disk_key; |
1854 | struct btrfs_chunk *chunk; | 1877 | struct btrfs_chunk *chunk; |
1855 | u8 *ptr; | 1878 | u8 *ptr; |
@@ -2175,7 +2198,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) | |||
2175 | bool retried = false; | 2198 | bool retried = false; |
2176 | struct extent_buffer *l; | 2199 | struct extent_buffer *l; |
2177 | struct btrfs_key key; | 2200 | struct btrfs_key key; |
2178 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | 2201 | struct btrfs_super_block *super_copy = root->fs_info->super_copy; |
2179 | u64 old_total = btrfs_super_total_bytes(super_copy); | 2202 | u64 old_total = btrfs_super_total_bytes(super_copy); |
2180 | u64 old_size = device->total_bytes; | 2203 | u64 old_size = device->total_bytes; |
2181 | u64 diff = device->total_bytes - new_size; | 2204 | u64 diff = device->total_bytes - new_size; |
@@ -2192,8 +2215,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) | |||
2192 | lock_chunks(root); | 2215 | lock_chunks(root); |
2193 | 2216 | ||
2194 | device->total_bytes = new_size; | 2217 | device->total_bytes = new_size; |
2195 | if (device->writeable) | 2218 | if (device->writeable) { |
2196 | device->fs_devices->total_rw_bytes -= diff; | 2219 | device->fs_devices->total_rw_bytes -= diff; |
2220 | spin_lock(&root->fs_info->free_chunk_lock); | ||
2221 | root->fs_info->free_chunk_space -= diff; | ||
2222 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
2223 | } | ||
2197 | unlock_chunks(root); | 2224 | unlock_chunks(root); |
2198 | 2225 | ||
2199 | again: | 2226 | again: |
@@ -2257,6 +2284,9 @@ again: | |||
2257 | device->total_bytes = old_size; | 2284 | device->total_bytes = old_size; |
2258 | if (device->writeable) | 2285 | if (device->writeable) |
2259 | device->fs_devices->total_rw_bytes += diff; | 2286 | device->fs_devices->total_rw_bytes += diff; |
2287 | spin_lock(&root->fs_info->free_chunk_lock); | ||
2288 | root->fs_info->free_chunk_space += diff; | ||
2289 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
2260 | unlock_chunks(root); | 2290 | unlock_chunks(root); |
2261 | goto done; | 2291 | goto done; |
2262 | } | 2292 | } |
@@ -2292,7 +2322,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, | |||
2292 | struct btrfs_key *key, | 2322 | struct btrfs_key *key, |
2293 | struct btrfs_chunk *chunk, int item_size) | 2323 | struct btrfs_chunk *chunk, int item_size) |
2294 | { | 2324 | { |
2295 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | 2325 | struct btrfs_super_block *super_copy = root->fs_info->super_copy; |
2296 | struct btrfs_disk_key disk_key; | 2326 | struct btrfs_disk_key disk_key; |
2297 | u32 array_size; | 2327 | u32 array_size; |
2298 | u8 *ptr; | 2328 | u8 *ptr; |
@@ -2615,6 +2645,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, | |||
2615 | index++; | 2645 | index++; |
2616 | } | 2646 | } |
2617 | 2647 | ||
2648 | spin_lock(&extent_root->fs_info->free_chunk_lock); | ||
2649 | extent_root->fs_info->free_chunk_space -= (stripe_size * | ||
2650 | map->num_stripes); | ||
2651 | spin_unlock(&extent_root->fs_info->free_chunk_lock); | ||
2652 | |||
2618 | index = 0; | 2653 | index = 0; |
2619 | stripe = &chunk->stripe; | 2654 | stripe = &chunk->stripe; |
2620 | while (index < map->num_stripes) { | 2655 | while (index < map->num_stripes) { |
@@ -2848,7 +2883,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, | |||
2848 | 2883 | ||
2849 | static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | 2884 | static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, |
2850 | u64 logical, u64 *length, | 2885 | u64 logical, u64 *length, |
2851 | struct btrfs_multi_bio **multi_ret, | 2886 | struct btrfs_bio **bbio_ret, |
2852 | int mirror_num) | 2887 | int mirror_num) |
2853 | { | 2888 | { |
2854 | struct extent_map *em; | 2889 | struct extent_map *em; |
@@ -2866,18 +2901,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | |||
2866 | int i; | 2901 | int i; |
2867 | int num_stripes; | 2902 | int num_stripes; |
2868 | int max_errors = 0; | 2903 | int max_errors = 0; |
2869 | struct btrfs_multi_bio *multi = NULL; | 2904 | struct btrfs_bio *bbio = NULL; |
2870 | 2905 | ||
2871 | if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) | 2906 | if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) |
2872 | stripes_allocated = 1; | 2907 | stripes_allocated = 1; |
2873 | again: | 2908 | again: |
2874 | if (multi_ret) { | 2909 | if (bbio_ret) { |
2875 | multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), | 2910 | bbio = kzalloc(btrfs_bio_size(stripes_allocated), |
2876 | GFP_NOFS); | 2911 | GFP_NOFS); |
2877 | if (!multi) | 2912 | if (!bbio) |
2878 | return -ENOMEM; | 2913 | return -ENOMEM; |
2879 | 2914 | ||
2880 | atomic_set(&multi->error, 0); | 2915 | atomic_set(&bbio->error, 0); |
2881 | } | 2916 | } |
2882 | 2917 | ||
2883 | read_lock(&em_tree->lock); | 2918 | read_lock(&em_tree->lock); |
@@ -2898,7 +2933,7 @@ again: | |||
2898 | if (mirror_num > map->num_stripes) | 2933 | if (mirror_num > map->num_stripes) |
2899 | mirror_num = 0; | 2934 | mirror_num = 0; |
2900 | 2935 | ||
2901 | /* if our multi bio struct is too small, back off and try again */ | 2936 | /* if our btrfs_bio struct is too small, back off and try again */ |
2902 | if (rw & REQ_WRITE) { | 2937 | if (rw & REQ_WRITE) { |
2903 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 2938 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
2904 | BTRFS_BLOCK_GROUP_DUP)) { | 2939 | BTRFS_BLOCK_GROUP_DUP)) { |
@@ -2917,11 +2952,11 @@ again: | |||
2917 | stripes_required = map->num_stripes; | 2952 | stripes_required = map->num_stripes; |
2918 | } | 2953 | } |
2919 | } | 2954 | } |
2920 | if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && | 2955 | if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && |
2921 | stripes_allocated < stripes_required) { | 2956 | stripes_allocated < stripes_required) { |
2922 | stripes_allocated = map->num_stripes; | 2957 | stripes_allocated = map->num_stripes; |
2923 | free_extent_map(em); | 2958 | free_extent_map(em); |
2924 | kfree(multi); | 2959 | kfree(bbio); |
2925 | goto again; | 2960 | goto again; |
2926 | } | 2961 | } |
2927 | stripe_nr = offset; | 2962 | stripe_nr = offset; |
@@ -2950,7 +2985,7 @@ again: | |||
2950 | *length = em->len - offset; | 2985 | *length = em->len - offset; |
2951 | } | 2986 | } |
2952 | 2987 | ||
2953 | if (!multi_ret) | 2988 | if (!bbio_ret) |
2954 | goto out; | 2989 | goto out; |
2955 | 2990 | ||
2956 | num_stripes = 1; | 2991 | num_stripes = 1; |
@@ -2975,13 +3010,17 @@ again: | |||
2975 | stripe_index = find_live_mirror(map, 0, | 3010 | stripe_index = find_live_mirror(map, 0, |
2976 | map->num_stripes, | 3011 | map->num_stripes, |
2977 | current->pid % map->num_stripes); | 3012 | current->pid % map->num_stripes); |
3013 | mirror_num = stripe_index + 1; | ||
2978 | } | 3014 | } |
2979 | 3015 | ||
2980 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { | 3016 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { |
2981 | if (rw & (REQ_WRITE | REQ_DISCARD)) | 3017 | if (rw & (REQ_WRITE | REQ_DISCARD)) { |
2982 | num_stripes = map->num_stripes; | 3018 | num_stripes = map->num_stripes; |
2983 | else if (mirror_num) | 3019 | } else if (mirror_num) { |
2984 | stripe_index = mirror_num - 1; | 3020 | stripe_index = mirror_num - 1; |
3021 | } else { | ||
3022 | mirror_num = 1; | ||
3023 | } | ||
2985 | 3024 | ||
2986 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | 3025 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { |
2987 | int factor = map->num_stripes / map->sub_stripes; | 3026 | int factor = map->num_stripes / map->sub_stripes; |
@@ -3001,6 +3040,7 @@ again: | |||
3001 | stripe_index = find_live_mirror(map, stripe_index, | 3040 | stripe_index = find_live_mirror(map, stripe_index, |
3002 | map->sub_stripes, stripe_index + | 3041 | map->sub_stripes, stripe_index + |
3003 | current->pid % map->sub_stripes); | 3042 | current->pid % map->sub_stripes); |
3043 | mirror_num = stripe_index + 1; | ||
3004 | } | 3044 | } |
3005 | } else { | 3045 | } else { |
3006 | /* | 3046 | /* |
@@ -3009,15 +3049,16 @@ again: | |||
3009 | * stripe_index is the number of our device in the stripe array | 3049 | * stripe_index is the number of our device in the stripe array |
3010 | */ | 3050 | */ |
3011 | stripe_index = do_div(stripe_nr, map->num_stripes); | 3051 | stripe_index = do_div(stripe_nr, map->num_stripes); |
3052 | mirror_num = stripe_index + 1; | ||
3012 | } | 3053 | } |
3013 | BUG_ON(stripe_index >= map->num_stripes); | 3054 | BUG_ON(stripe_index >= map->num_stripes); |
3014 | 3055 | ||
3015 | if (rw & REQ_DISCARD) { | 3056 | if (rw & REQ_DISCARD) { |
3016 | for (i = 0; i < num_stripes; i++) { | 3057 | for (i = 0; i < num_stripes; i++) { |
3017 | multi->stripes[i].physical = | 3058 | bbio->stripes[i].physical = |
3018 | map->stripes[stripe_index].physical + | 3059 | map->stripes[stripe_index].physical + |
3019 | stripe_offset + stripe_nr * map->stripe_len; | 3060 | stripe_offset + stripe_nr * map->stripe_len; |
3020 | multi->stripes[i].dev = map->stripes[stripe_index].dev; | 3061 | bbio->stripes[i].dev = map->stripes[stripe_index].dev; |
3021 | 3062 | ||
3022 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 3063 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
3023 | u64 stripes; | 3064 | u64 stripes; |
@@ -3038,16 +3079,16 @@ again: | |||
3038 | } | 3079 | } |
3039 | stripes = stripe_nr_end - 1 - j; | 3080 | stripes = stripe_nr_end - 1 - j; |
3040 | do_div(stripes, map->num_stripes); | 3081 | do_div(stripes, map->num_stripes); |
3041 | multi->stripes[i].length = map->stripe_len * | 3082 | bbio->stripes[i].length = map->stripe_len * |
3042 | (stripes - stripe_nr + 1); | 3083 | (stripes - stripe_nr + 1); |
3043 | 3084 | ||
3044 | if (i == 0) { | 3085 | if (i == 0) { |
3045 | multi->stripes[i].length -= | 3086 | bbio->stripes[i].length -= |
3046 | stripe_offset; | 3087 | stripe_offset; |
3047 | stripe_offset = 0; | 3088 | stripe_offset = 0; |
3048 | } | 3089 | } |
3049 | if (stripe_index == last_stripe) | 3090 | if (stripe_index == last_stripe) |
3050 | multi->stripes[i].length -= | 3091 | bbio->stripes[i].length -= |
3051 | stripe_end_offset; | 3092 | stripe_end_offset; |
3052 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | 3093 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { |
3053 | u64 stripes; | 3094 | u64 stripes; |
@@ -3072,11 +3113,11 @@ again: | |||
3072 | } | 3113 | } |
3073 | stripes = stripe_nr_end - 1 - j; | 3114 | stripes = stripe_nr_end - 1 - j; |
3074 | do_div(stripes, factor); | 3115 | do_div(stripes, factor); |
3075 | multi->stripes[i].length = map->stripe_len * | 3116 | bbio->stripes[i].length = map->stripe_len * |
3076 | (stripes - stripe_nr + 1); | 3117 | (stripes - stripe_nr + 1); |
3077 | 3118 | ||
3078 | if (i < map->sub_stripes) { | 3119 | if (i < map->sub_stripes) { |
3079 | multi->stripes[i].length -= | 3120 | bbio->stripes[i].length -= |
3080 | stripe_offset; | 3121 | stripe_offset; |
3081 | if (i == map->sub_stripes - 1) | 3122 | if (i == map->sub_stripes - 1) |
3082 | stripe_offset = 0; | 3123 | stripe_offset = 0; |
@@ -3084,11 +3125,11 @@ again: | |||
3084 | if (stripe_index >= last_stripe && | 3125 | if (stripe_index >= last_stripe && |
3085 | stripe_index <= (last_stripe + | 3126 | stripe_index <= (last_stripe + |
3086 | map->sub_stripes - 1)) { | 3127 | map->sub_stripes - 1)) { |
3087 | multi->stripes[i].length -= | 3128 | bbio->stripes[i].length -= |
3088 | stripe_end_offset; | 3129 | stripe_end_offset; |
3089 | } | 3130 | } |
3090 | } else | 3131 | } else |
3091 | multi->stripes[i].length = *length; | 3132 | bbio->stripes[i].length = *length; |
3092 | 3133 | ||
3093 | stripe_index++; | 3134 | stripe_index++; |
3094 | if (stripe_index == map->num_stripes) { | 3135 | if (stripe_index == map->num_stripes) { |
@@ -3099,19 +3140,20 @@ again: | |||
3099 | } | 3140 | } |
3100 | } else { | 3141 | } else { |
3101 | for (i = 0; i < num_stripes; i++) { | 3142 | for (i = 0; i < num_stripes; i++) { |
3102 | multi->stripes[i].physical = | 3143 | bbio->stripes[i].physical = |
3103 | map->stripes[stripe_index].physical + | 3144 | map->stripes[stripe_index].physical + |
3104 | stripe_offset + | 3145 | stripe_offset + |
3105 | stripe_nr * map->stripe_len; | 3146 | stripe_nr * map->stripe_len; |
3106 | multi->stripes[i].dev = | 3147 | bbio->stripes[i].dev = |
3107 | map->stripes[stripe_index].dev; | 3148 | map->stripes[stripe_index].dev; |
3108 | stripe_index++; | 3149 | stripe_index++; |
3109 | } | 3150 | } |
3110 | } | 3151 | } |
3111 | if (multi_ret) { | 3152 | if (bbio_ret) { |
3112 | *multi_ret = multi; | 3153 | *bbio_ret = bbio; |
3113 | multi->num_stripes = num_stripes; | 3154 | bbio->num_stripes = num_stripes; |
3114 | multi->max_errors = max_errors; | 3155 | bbio->max_errors = max_errors; |
3156 | bbio->mirror_num = mirror_num; | ||
3115 | } | 3157 | } |
3116 | out: | 3158 | out: |
3117 | free_extent_map(em); | 3159 | free_extent_map(em); |
@@ -3120,9 +3162,9 @@ out: | |||
3120 | 3162 | ||
3121 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | 3163 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, |
3122 | u64 logical, u64 *length, | 3164 | u64 logical, u64 *length, |
3123 | struct btrfs_multi_bio **multi_ret, int mirror_num) | 3165 | struct btrfs_bio **bbio_ret, int mirror_num) |
3124 | { | 3166 | { |
3125 | return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, | 3167 | return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret, |
3126 | mirror_num); | 3168 | mirror_num); |
3127 | } | 3169 | } |
3128 | 3170 | ||
@@ -3191,28 +3233,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
3191 | return 0; | 3233 | return 0; |
3192 | } | 3234 | } |
3193 | 3235 | ||
3194 | static void end_bio_multi_stripe(struct bio *bio, int err) | 3236 | static void btrfs_end_bio(struct bio *bio, int err) |
3195 | { | 3237 | { |
3196 | struct btrfs_multi_bio *multi = bio->bi_private; | 3238 | struct btrfs_bio *bbio = bio->bi_private; |
3197 | int is_orig_bio = 0; | 3239 | int is_orig_bio = 0; |
3198 | 3240 | ||
3199 | if (err) | 3241 | if (err) |
3200 | atomic_inc(&multi->error); | 3242 | atomic_inc(&bbio->error); |
3201 | 3243 | ||
3202 | if (bio == multi->orig_bio) | 3244 | if (bio == bbio->orig_bio) |
3203 | is_orig_bio = 1; | 3245 | is_orig_bio = 1; |
3204 | 3246 | ||
3205 | if (atomic_dec_and_test(&multi->stripes_pending)) { | 3247 | if (atomic_dec_and_test(&bbio->stripes_pending)) { |
3206 | if (!is_orig_bio) { | 3248 | if (!is_orig_bio) { |
3207 | bio_put(bio); | 3249 | bio_put(bio); |
3208 | bio = multi->orig_bio; | 3250 | bio = bbio->orig_bio; |
3209 | } | 3251 | } |
3210 | bio->bi_private = multi->private; | 3252 | bio->bi_private = bbio->private; |
3211 | bio->bi_end_io = multi->end_io; | 3253 | bio->bi_end_io = bbio->end_io; |
3254 | bio->bi_bdev = (struct block_device *) | ||
3255 | (unsigned long)bbio->mirror_num; | ||
3212 | /* only send an error to the higher layers if it is | 3256 | /* only send an error to the higher layers if it is |
3213 | * beyond the tolerance of the multi-bio | 3257 | * beyond the tolerance of the multi-bio |
3214 | */ | 3258 | */ |
3215 | if (atomic_read(&multi->error) > multi->max_errors) { | 3259 | if (atomic_read(&bbio->error) > bbio->max_errors) { |
3216 | err = -EIO; | 3260 | err = -EIO; |
3217 | } else if (err) { | 3261 | } else if (err) { |
3218 | /* | 3262 | /* |
@@ -3222,7 +3266,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err) | |||
3222 | set_bit(BIO_UPTODATE, &bio->bi_flags); | 3266 | set_bit(BIO_UPTODATE, &bio->bi_flags); |
3223 | err = 0; | 3267 | err = 0; |
3224 | } | 3268 | } |
3225 | kfree(multi); | 3269 | kfree(bbio); |
3226 | 3270 | ||
3227 | bio_endio(bio, err); | 3271 | bio_endio(bio, err); |
3228 | } else if (!is_orig_bio) { | 3272 | } else if (!is_orig_bio) { |
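btrfs_end_bio() above reuses the completed bio's bi_bdev field to hand the mirror number back to the original end_io callback: the device pointer is dead at that point, so a small integer is cast into it. A userspace sketch of the encode/decode pair (fake_bio is an invented stand-in for struct bio):

/* Userspace sketch of the mirror-number smuggling above; the device
 * pointer is unused by completion time and can carry an integer. */
#include <stdio.h>

struct fake_bio { void *bi_bdev; };

static void stash_mirror_num(struct fake_bio *bio, int mirror_num)
{
    bio->bi_bdev = (void *)(unsigned long)mirror_num;   /* encode */
}

static int read_mirror_num(const struct fake_bio *bio)
{
    return (int)(unsigned long)bio->bi_bdev;            /* decode */
}

int main(void)
{
    struct fake_bio bio;
    stash_mirror_num(&bio, 2);
    printf("completed via mirror %d\n", read_mirror_num(&bio));
    return 0;
}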
@@ -3302,20 +3346,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
3302 | u64 logical = (u64)bio->bi_sector << 9; | 3346 | u64 logical = (u64)bio->bi_sector << 9; |
3303 | u64 length = 0; | 3347 | u64 length = 0; |
3304 | u64 map_length; | 3348 | u64 map_length; |
3305 | struct btrfs_multi_bio *multi = NULL; | ||
3306 | int ret; | 3349 | int ret; |
3307 | int dev_nr = 0; | 3350 | int dev_nr = 0; |
3308 | int total_devs = 1; | 3351 | int total_devs = 1; |
3352 | struct btrfs_bio *bbio = NULL; | ||
3309 | 3353 | ||
3310 | length = bio->bi_size; | 3354 | length = bio->bi_size; |
3311 | map_tree = &root->fs_info->mapping_tree; | 3355 | map_tree = &root->fs_info->mapping_tree; |
3312 | map_length = length; | 3356 | map_length = length; |
3313 | 3357 | ||
3314 | ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, | 3358 | ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio, |
3315 | mirror_num); | 3359 | mirror_num); |
3316 | BUG_ON(ret); | 3360 | BUG_ON(ret); |
3317 | 3361 | ||
3318 | total_devs = multi->num_stripes; | 3362 | total_devs = bbio->num_stripes; |
3319 | if (map_length < length) { | 3363 | if (map_length < length) { |
3320 | printk(KERN_CRIT "mapping failed logical %llu bio len %llu " | 3364 | printk(KERN_CRIT "mapping failed logical %llu bio len %llu " |
3321 | "len %llu\n", (unsigned long long)logical, | 3365 | "len %llu\n", (unsigned long long)logical, |
@@ -3323,25 +3367,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
3323 | (unsigned long long)map_length); | 3367 | (unsigned long long)map_length); |
3324 | BUG(); | 3368 | BUG(); |
3325 | } | 3369 | } |
3326 | multi->end_io = first_bio->bi_end_io; | 3370 | |
3327 | multi->private = first_bio->bi_private; | 3371 | bbio->orig_bio = first_bio; |
3328 | multi->orig_bio = first_bio; | 3372 | bbio->private = first_bio->bi_private; |
3329 | atomic_set(&multi->stripes_pending, multi->num_stripes); | 3373 | bbio->end_io = first_bio->bi_end_io; |
3374 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
3330 | 3375 | ||
3331 | while (dev_nr < total_devs) { | 3376 | while (dev_nr < total_devs) { |
3332 | if (total_devs > 1) { | 3377 | if (dev_nr < total_devs - 1) { |
3333 | if (dev_nr < total_devs - 1) { | 3378 | bio = bio_clone(first_bio, GFP_NOFS); |
3334 | bio = bio_clone(first_bio, GFP_NOFS); | 3379 | BUG_ON(!bio); |
3335 | BUG_ON(!bio); | 3380 | } else { |
3336 | } else { | 3381 | bio = first_bio; |
3337 | bio = first_bio; | ||
3338 | } | ||
3339 | bio->bi_private = multi; | ||
3340 | bio->bi_end_io = end_bio_multi_stripe; | ||
3341 | } | 3382 | } |
3342 | bio->bi_sector = multi->stripes[dev_nr].physical >> 9; | 3383 | bio->bi_private = bbio; |
3343 | dev = multi->stripes[dev_nr].dev; | 3384 | bio->bi_end_io = btrfs_end_bio; |
3385 | bio->bi_sector = bbio->stripes[dev_nr].physical >> 9; | ||
3386 | dev = bbio->stripes[dev_nr].dev; | ||
3344 | if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { | 3387 | if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { |
3388 | pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu " | ||
3389 | "(%s id %llu), size=%u\n", rw, | ||
3390 | (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, | ||
3391 | dev->name, dev->devid, bio->bi_size); | ||
3345 | bio->bi_bdev = dev->bdev; | 3392 | bio->bi_bdev = dev->bdev; |
3346 | if (async_submit) | 3393 | if (async_submit) |
3347 | schedule_bio(root, dev, rw, bio); | 3394 | schedule_bio(root, dev, rw, bio); |
@@ -3354,8 +3401,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
3354 | } | 3401 | } |
3355 | dev_nr++; | 3402 | dev_nr++; |
3356 | } | 3403 | } |
3357 | if (total_devs == 1) | ||
3358 | kfree(multi); | ||
3359 | return 0; | 3404 | return 0; |
3360 | } | 3405 | } |
3361 | 3406 | ||
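With the single-device special case removed above, every bio now goes through btrfs_end_bio(), which completes the original bio only when the shared stripes_pending count drops to zero and reports -EIO only when the error count exceeds max_errors. A C11 sketch of that completion pattern (userspace model, counters standing in for the bio machinery):

/* C11 model of the stripes_pending completion pattern (not kernel code). */
#include <stdatomic.h>
#include <stdio.h>

struct stripe_group {
    atomic_int pending;     /* one count per outstanding clone */
    atomic_int errors;      /* failures seen so far */
    int max_errors;         /* tolerated before reporting -EIO */
};

static void clone_done(struct stripe_group *g, int err)
{
    if (err)
        atomic_fetch_add(&g->errors, 1);
    /* whoever drops the count to zero finishes the original bio */
    if (atomic_fetch_sub(&g->pending, 1) == 1) {
        int bad = atomic_load(&g->errors) > g->max_errors;
        printf("group done: %s\n", bad ? "-EIO" : "ok");
    }
}

int main(void)
{
    struct stripe_group g = { 3, 0, 1 };    /* 3 stripes, tolerate 1 */
    clone_done(&g, 0);
    clone_done(&g, -5);     /* one stripe failed */
    clone_done(&g, 0);      /* last drop reports: ok */
    return 0;
}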
@@ -3616,15 +3661,20 @@ static int read_one_dev(struct btrfs_root *root, | |||
3616 | fill_device_from_item(leaf, dev_item, device); | 3661 | fill_device_from_item(leaf, dev_item, device); |
3617 | device->dev_root = root->fs_info->dev_root; | 3662 | device->dev_root = root->fs_info->dev_root; |
3618 | device->in_fs_metadata = 1; | 3663 | device->in_fs_metadata = 1; |
3619 | if (device->writeable) | 3664 | if (device->writeable) { |
3620 | device->fs_devices->total_rw_bytes += device->total_bytes; | 3665 | device->fs_devices->total_rw_bytes += device->total_bytes; |
3666 | spin_lock(&root->fs_info->free_chunk_lock); | ||
3667 | root->fs_info->free_chunk_space += device->total_bytes - | ||
3668 | device->bytes_used; | ||
3669 | spin_unlock(&root->fs_info->free_chunk_lock); | ||
3670 | } | ||
3621 | ret = 0; | 3671 | ret = 0; |
3622 | return ret; | 3672 | return ret; |
3623 | } | 3673 | } |
3624 | 3674 | ||
3625 | int btrfs_read_sys_array(struct btrfs_root *root) | 3675 | int btrfs_read_sys_array(struct btrfs_root *root) |
3626 | { | 3676 | { |
3627 | struct btrfs_super_block *super_copy = &root->fs_info->super_copy; | 3677 | struct btrfs_super_block *super_copy = root->fs_info->super_copy; |
3628 | struct extent_buffer *sb; | 3678 | struct extent_buffer *sb; |
3629 | struct btrfs_disk_key *disk_key; | 3679 | struct btrfs_disk_key *disk_key; |
3630 | struct btrfs_chunk *chunk; | 3680 | struct btrfs_chunk *chunk; |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6d866db4e17..78f2d4d4f37 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -92,6 +92,20 @@ struct btrfs_device { | |||
92 | struct btrfs_work work; | 92 | struct btrfs_work work; |
93 | struct rcu_head rcu; | 93 | struct rcu_head rcu; |
94 | struct work_struct rcu_work; | 94 | struct work_struct rcu_work; |
95 | |||
96 | /* readahead state */ | ||
97 | spinlock_t reada_lock; | ||
98 | atomic_t reada_in_flight; | ||
99 | u64 reada_next; | ||
100 | struct reada_zone *reada_curr_zone; | ||
101 | struct radix_tree_root reada_zones; | ||
102 | struct radix_tree_root reada_extents; | ||
103 | |||
104 | /* for sending down flush barriers */ | ||
105 | struct bio *flush_bio; | ||
106 | struct completion flush_wait; | ||
107 | int nobarriers; | ||
108 | |||
95 | }; | 109 | }; |
96 | 110 | ||
97 | struct btrfs_fs_devices { | 111 | struct btrfs_fs_devices { |
@@ -136,7 +150,10 @@ struct btrfs_bio_stripe { | |||
136 | u64 length; /* only used for discard mappings */ | 150 | u64 length; /* only used for discard mappings */ |
137 | }; | 151 | }; |
138 | 152 | ||
139 | struct btrfs_multi_bio { | 153 | struct btrfs_bio; |
154 | typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); | ||
155 | |||
156 | struct btrfs_bio { | ||
140 | atomic_t stripes_pending; | 157 | atomic_t stripes_pending; |
141 | bio_end_io_t *end_io; | 158 | bio_end_io_t *end_io; |
142 | struct bio *orig_bio; | 159 | struct bio *orig_bio; |
@@ -144,6 +161,7 @@ struct btrfs_multi_bio { | |||
144 | atomic_t error; | 161 | atomic_t error; |
145 | int max_errors; | 162 | int max_errors; |
146 | int num_stripes; | 163 | int num_stripes; |
164 | int mirror_num; | ||
147 | struct btrfs_bio_stripe stripes[]; | 165 | struct btrfs_bio_stripe stripes[]; |
148 | }; | 166 | }; |
149 | 167 | ||
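struct btrfs_bio keeps btrfs_multi_bio's trailing flexible array of stripes, and btrfs_bio_size() sizes the header plus n stripe entries as one allocation. The same pattern in a self-contained form (field names invented for illustration):

/* Self-contained version of the btrfs_bio_size() pattern: a header
 * plus a flexible array of per-stripe entries in one allocation. */
#include <stdio.h>
#include <stdlib.h>

struct stripe { unsigned long long physical, length; };

struct multi {
    int num_stripes;
    struct stripe stripes[];    /* flexible array member */
};

#define multi_size(n) (sizeof(struct multi) + sizeof(struct stripe) * (n))

int main(void)
{
    int n = 3;
    struct multi *m = calloc(1, multi_size(n));   /* header + n stripes */
    if (!m)
        return 1;
    m->num_stripes = n;
    printf("allocated %zu bytes for %d stripes\n", multi_size(n), n);
    free(m);
    return 0;
}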
@@ -171,7 +189,7 @@ struct map_lookup { | |||
171 | int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, | 189 | int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, |
172 | u64 end, u64 *length); | 190 | u64 end, u64 *length); |
173 | 191 | ||
174 | #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ | 192 | #define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ |
175 | (sizeof(struct btrfs_bio_stripe) * (n))) | 193 | (sizeof(struct btrfs_bio_stripe) * (n))) |
176 | 194 | ||
177 | int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, | 195 | int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, |
@@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, | |||
180 | u64 chunk_offset, u64 start, u64 num_bytes); | 198 | u64 chunk_offset, u64 start, u64 num_bytes); |
181 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, | 199 | int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, |
182 | u64 logical, u64 *length, | 200 | u64 logical, u64 *length, |
183 | struct btrfs_multi_bio **multi_ret, int mirror_num); | 201 | struct btrfs_bio **bbio_ret, int mirror_num); |
184 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 202 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
185 | u64 chunk_start, u64 physical, u64 devid, | 203 | u64 chunk_start, u64 physical, u64 devid, |
186 | u64 **logical, int *naddrs, int *stripe_len); | 204 | u64 **logical, int *naddrs, int *stripe_len); |
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 426aa464f1a..3848b04e310 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c | |||
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans, | |||
127 | again: | 127 | again: |
128 | ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), | 128 | ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), |
129 | name, name_len, value, size); | 129 | name, name_len, value, size); |
130 | /* | ||
131 | * If we're setting an xattr to a new value but the new value is, say, | ||
132 | * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting | ||
133 | * back from split_leaf. This is because it thinks we'll be extending | ||
134 | * the existing item size, but we're asking for enough space to add the | ||
135 | * item itself. So if we get EOVERFLOW just set ret to EEXIST and let | ||
136 | * the rest of the function figure it out. | ||
137 | */ | ||
138 | if (ret == -EOVERFLOW) | ||
139 | ret = -EEXIST; | ||
140 | |||
130 | if (ret == -EEXIST) { | 141 | if (ret == -EEXIST) { |
131 | if (flags & XATTR_CREATE) | 142 | if (flags & XATTR_CREATE) |
132 | goto out; | 143 | goto out; |
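Schematically, the set-xattr flow after this change treats EOVERFLOW from the insert exactly like EEXIST and falls into the existing replace path. A toy model of that control flow (insert_xattr()/delete_xattr() are invented stubs; the real code re-runs btrfs_insert_xattr_item() after removing the old item):

/* Toy model of the EOVERFLOW -> EEXIST mapping above (not kernel code). */
#include <errno.h>
#include <stdio.h>

#define XATTR_CREATE 0x1        /* fail if the attribute already exists */

static int insert_xattr(const char *name)
{
    (void)name;                 /* pretend the attribute already exists */
    return -EEXIST;
}

static void delete_xattr(const char *name)
{
    (void)name;
}

static int do_setxattr(const char *name, int flags)
{
    int ret = insert_xattr(name);
    if (ret == -EOVERFLOW)      /* split_leaf over-counted: treat as EEXIST */
        ret = -EEXIST;
    if (ret == -EEXIST) {
        if (flags & XATTR_CREATE)
            return -EEXIST;     /* caller demanded a fresh attribute */
        delete_xattr(name);     /* replace path: drop the old item, retry */
        ret = 0;                /* second insert assumed to succeed */
    }
    return ret;
}

int main(void)
{
    printf("replace: %d\n", do_setxattr("user.test", 0));
    printf("create:  %d\n", do_setxattr("user.test", XATTR_CREATE));
    return 0;
}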
diff --git a/fs/buffer.c b/fs/buffer.c index 70a19745cb6..19d8eb7fdc8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -288,7 +288,7 @@ static void free_more_memory(void) | |||
288 | struct zone *zone; | 288 | struct zone *zone; |
289 | int nid; | 289 | int nid; |
290 | 290 | ||
291 | wakeup_flusher_threads(1024); | 291 | wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); |
292 | yield(); | 292 | yield(); |
293 | 293 | ||
294 | for_each_online_node(nid) { | 294 | for_each_online_node(nid) { |
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 2cfb695d1f8..5d9b9acc5fc 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c | |||
@@ -204,7 +204,7 @@ int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, | |||
204 | } | 204 | } |
205 | 205 | ||
206 | /* first calculate the 24-byte NTLM response and then the 16-byte session key */ | 206 | /* first calculate the 24-byte NTLM response and then the 16-byte session key */ |
207 | int setup_ntlm_response(struct cifs_ses *ses) | 207 | int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) |
208 | { | 208 | { |
209 | int rc = 0; | 209 | int rc = 0; |
210 | unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; | 210 | unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; |
@@ -221,14 +221,14 @@ int setup_ntlm_response(struct cifs_ses *ses) | |||
221 | ses->auth_key.len = temp_len; | 221 | ses->auth_key.len = temp_len; |
222 | 222 | ||
223 | rc = SMBNTencrypt(ses->password, ses->server->cryptkey, | 223 | rc = SMBNTencrypt(ses->password, ses->server->cryptkey, |
224 | ses->auth_key.response + CIFS_SESS_KEY_SIZE); | 224 | ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); |
225 | if (rc) { | 225 | if (rc) { |
226 | cFYI(1, "%s Can't generate NTLM response, error: %d", | 226 | cFYI(1, "%s Can't generate NTLM response, error: %d", |
227 | __func__, rc); | 227 | __func__, rc); |
228 | return rc; | 228 | return rc; |
229 | } | 229 | } |
230 | 230 | ||
231 | rc = E_md4hash(ses->password, temp_key); | 231 | rc = E_md4hash(ses->password, temp_key, nls_cp); |
232 | if (rc) { | 232 | if (rc) { |
233 | cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); | 233 | cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); |
234 | return rc; | 234 | return rc; |
@@ -404,7 +404,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, | |||
404 | } | 404 | } |
405 | 405 | ||
406 | /* calculate md4 hash of password */ | 406 | /* calculate md4 hash of password */ |
407 | E_md4hash(ses->password, nt_hash); | 407 | E_md4hash(ses->password, nt_hash, nls_cp); |
408 | 408 | ||
409 | rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, | 409 | rc = crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, |
410 | CIFS_NTHASH_SIZE); | 410 | CIFS_NTHASH_SIZE); |
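E_md4hash() gains an nls_table argument because the NT hash is MD4 over the password encoded as little-endian UTF-16, so the mount's codepage has to drive the conversion. A sketch of just the encoding step, restricted to ASCII input for brevity (the real code converts through the supplied nls_table and then runs MD4 over the result; MD4 itself is omitted here):

/* Encoding step only; ASCII shortcut is an assumption of this sketch. */
#include <stdint.h>
#include <stdio.h>

static size_t ascii_to_utf16le(const char *in, uint8_t *out, size_t outlen)
{
    size_t n = 0;
    for (; *in && n + 2 <= outlen; in++) {
        out[n++] = (uint8_t)*in;    /* low byte: the code point */
        out[n++] = 0;               /* high byte: zero for ASCII */
    }
    return n;                       /* byte count MD4 would hash */
}

int main(void)
{
    uint8_t buf[64];
    size_t len = ascii_to_utf16le("secret", buf, sizeof(buf));
    printf("%zu bytes to hash\n", len);     /* 12 for "secret" */
    return 0;
}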
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d9dbaf869cd..30ff56005d8 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h | |||
@@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); | |||
125 | extern const struct export_operations cifs_export_ops; | 125 | extern const struct export_operations cifs_export_ops; |
126 | #endif /* CONFIG_CIFS_NFSD_EXPORT */ | 126 | #endif /* CONFIG_CIFS_NFSD_EXPORT */ |
127 | 127 | ||
128 | #define CIFS_VERSION "1.75" | 128 | #define CIFS_VERSION "1.76" |
129 | #endif /* _CIFSFS_H */ | 129 | #endif /* _CIFSFS_H */ |
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ef4f631e4c0..6f4e243e0f6 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h | |||
@@ -395,8 +395,9 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, | |||
395 | extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, | 395 | extern int cifs_verify_signature(struct kvec *iov, unsigned int nr_iov, |
396 | struct TCP_Server_Info *server, | 396 | struct TCP_Server_Info *server, |
397 | __u32 expected_sequence_number); | 397 | __u32 expected_sequence_number); |
398 | extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); | 398 | extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, |
399 | extern int setup_ntlm_response(struct cifs_ses *); | 399 | const struct nls_table *); |
400 | extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); | ||
400 | extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); | 401 | extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); |
401 | extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); | 402 | extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); |
402 | extern void cifs_crypto_shash_release(struct TCP_Server_Info *); | 403 | extern void cifs_crypto_shash_release(struct TCP_Server_Info *); |
@@ -448,7 +449,8 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, | |||
448 | const unsigned char *path, | 449 | const unsigned char *path, |
449 | struct cifs_sb_info *cifs_sb, int xid); | 450 | struct cifs_sb_info *cifs_sb, int xid); |
450 | extern int mdfour(unsigned char *, unsigned char *, int); | 451 | extern int mdfour(unsigned char *, unsigned char *, int); |
451 | extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); | 452 | extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, |
453 | const struct nls_table *codepage); | ||
452 | extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, | 454 | extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, |
453 | unsigned char *p24); | 455 | unsigned char *p24); |
454 | 456 | ||
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d545a95c30e..8cd4b52d421 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <asm/uaccess.h> | 37 | #include <asm/uaccess.h> |
38 | #include <asm/processor.h> | 38 | #include <asm/processor.h> |
39 | #include <linux/inet.h> | 39 | #include <linux/inet.h> |
40 | #include <linux/module.h> | ||
40 | #include <net/ipv6.h> | 41 | #include <net/ipv6.h> |
41 | #include "cifspdu.h" | 42 | #include "cifspdu.h" |
42 | #include "cifsglob.h" | 43 | #include "cifsglob.h" |
@@ -440,6 +441,8 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig, | |||
440 | smb_msg.msg_controllen = 0; | 441 | smb_msg.msg_controllen = 0; |
441 | 442 | ||
442 | for (total_read = 0; to_read; total_read += length, to_read -= length) { | 443 | for (total_read = 0; to_read; total_read += length, to_read -= length) { |
444 | try_to_freeze(); | ||
445 | |||
443 | if (server_unresponsive(server)) { | 446 | if (server_unresponsive(server)) { |
444 | total_read = -EAGAIN; | 447 | total_read = -EAGAIN; |
445 | break; | 448 | break; |
@@ -3452,7 +3455,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, | |||
3452 | else | 3455 | else |
3453 | #endif /* CIFS_WEAK_PW_HASH */ | 3456 | #endif /* CIFS_WEAK_PW_HASH */ |
3454 | rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, | 3457 | rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, |
3455 | bcc_ptr); | 3458 | bcc_ptr, nls_codepage); |
3456 | 3459 | ||
3457 | bcc_ptr += CIFS_AUTH_RESP_SIZE; | 3460 | bcc_ptr += CIFS_AUTH_RESP_SIZE; |
3458 | if (ses->capabilities & CAP_UNICODE) { | 3461 | if (ses->capabilities & CAP_UNICODE) { |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ea096ce5d4f..4dd9283885e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -645,20 +645,20 @@ int cifs_closedir(struct inode *inode, struct file *file) | |||
645 | } | 645 | } |
646 | 646 | ||
647 | static struct cifsLockInfo * | 647 | static struct cifsLockInfo * |
648 | cifs_lock_init(__u64 len, __u64 offset, __u8 type, __u16 netfid) | 648 | cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 netfid) |
649 | { | 649 | { |
650 | struct cifsLockInfo *li = | 650 | struct cifsLockInfo *lock = |
651 | kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); | 651 | kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); |
652 | if (!li) | 652 | if (!lock) |
653 | return li; | 653 | return lock; |
654 | li->netfid = netfid; | 654 | lock->offset = offset; |
655 | li->offset = offset; | 655 | lock->length = length; |
656 | li->length = len; | 656 | lock->type = type; |
657 | li->type = type; | 657 | lock->netfid = netfid; |
658 | li->pid = current->tgid; | 658 | lock->pid = current->tgid; |
659 | INIT_LIST_HEAD(&li->blist); | 659 | INIT_LIST_HEAD(&lock->blist); |
660 | init_waitqueue_head(&li->block_q); | 660 | init_waitqueue_head(&lock->block_q); |
661 | return li; | 661 | return lock; |
662 | } | 662 | } |
663 | 663 | ||
664 | static void | 664 | static void |
@@ -672,7 +672,7 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) | |||
672 | } | 672 | } |
673 | 673 | ||
674 | static bool | 674 | static bool |
675 | cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, | 675 | __cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, |
676 | __u64 length, __u8 type, __u16 netfid, | 676 | __u64 length, __u8 type, __u16 netfid, |
677 | struct cifsLockInfo **conf_lock) | 677 | struct cifsLockInfo **conf_lock) |
678 | { | 678 | { |
@@ -694,6 +694,21 @@ cifs_find_lock_conflict(struct cifsInodeInfo *cinode, __u64 offset, | |||
694 | return false; | 694 | return false; |
695 | } | 695 | } |
696 | 696 | ||
697 | static bool | ||
698 | cifs_find_lock_conflict(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, | ||
699 | struct cifsLockInfo **conf_lock) | ||
700 | { | ||
701 | return __cifs_find_lock_conflict(cinode, lock->offset, lock->length, | ||
702 | lock->type, lock->netfid, conf_lock); | ||
703 | } | ||
704 | |||
705 | /* | ||
706 | * Check if there is another lock that prevents us from setting the lock | ||
707 | * (mandatory style). If such a lock exists, update the flock structure with | ||
708 | * its properties. Otherwise, set the flock type to F_UNLCK if we can cache | ||
709 | * brlocks, or leave it unchanged if we can't. Returns 0 if no request to | ||
710 | * the server is needed, or 1 otherwise. | ||
711 | */ | ||
697 | static int | 712 | static int |
698 | cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, | 713 | cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, |
699 | __u8 type, __u16 netfid, struct file_lock *flock) | 714 | __u8 type, __u16 netfid, struct file_lock *flock) |
@@ -704,8 +719,8 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, | |||
704 | 719 | ||
705 | mutex_lock(&cinode->lock_mutex); | 720 | mutex_lock(&cinode->lock_mutex); |
706 | 721 | ||
707 | exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, | 722 | exist = __cifs_find_lock_conflict(cinode, offset, length, type, netfid, |
708 | &conf_lock); | 723 | &conf_lock); |
709 | if (exist) { | 724 | if (exist) { |
710 | flock->fl_start = conf_lock->offset; | 725 | flock->fl_start = conf_lock->offset; |
711 | flock->fl_end = conf_lock->offset + conf_lock->length - 1; | 726 | flock->fl_end = conf_lock->offset + conf_lock->length - 1; |
@@ -723,40 +738,33 @@ cifs_lock_test(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, | |||
723 | return rc; | 738 | return rc; |
724 | } | 739 | } |
725 | 740 | ||
726 | static int | 741 | static void |
727 | cifs_lock_add(struct cifsInodeInfo *cinode, __u64 len, __u64 offset, | 742 | cifs_lock_add(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock) |
728 | __u8 type, __u16 netfid) | ||
729 | { | 743 | { |
730 | struct cifsLockInfo *li; | ||
731 | |||
732 | li = cifs_lock_init(len, offset, type, netfid); | ||
733 | if (!li) | ||
734 | return -ENOMEM; | ||
735 | |||
736 | mutex_lock(&cinode->lock_mutex); | 744 | mutex_lock(&cinode->lock_mutex); |
737 | list_add_tail(&li->llist, &cinode->llist); | 745 | list_add_tail(&lock->llist, &cinode->llist); |
738 | mutex_unlock(&cinode->lock_mutex); | 746 | mutex_unlock(&cinode->lock_mutex); |
739 | return 0; | ||
740 | } | 747 | } |
741 | 748 | ||
749 | /* | ||
750 | * Set the byte-range lock (mandatory style). Returns: | ||
751 | * 1) 0, if we set the lock and don't need to send a request to the server; ||
752 | * 2) 1, if no locks prevent us but we need to send a request to the server; ||
753 | * 3) -EACCES, if there is a lock that prevents us and wait is false. ||
754 | */ | ||
742 | static int | 755 | static int |
743 | cifs_lock_add_if(struct cifsInodeInfo *cinode, __u64 offset, __u64 length, | 756 | cifs_lock_add_if(struct cifsInodeInfo *cinode, struct cifsLockInfo *lock, |
744 | __u8 type, __u16 netfid, bool wait) | 757 | bool wait) |
745 | { | 758 | { |
746 | struct cifsLockInfo *lock, *conf_lock; | 759 | struct cifsLockInfo *conf_lock; |
747 | bool exist; | 760 | bool exist; |
748 | int rc = 0; | 761 | int rc = 0; |
749 | 762 | ||
750 | lock = cifs_lock_init(length, offset, type, netfid); | ||
751 | if (!lock) | ||
752 | return -ENOMEM; | ||
753 | |||
754 | try_again: | 763 | try_again: |
755 | exist = false; | 764 | exist = false; |
756 | mutex_lock(&cinode->lock_mutex); | 765 | mutex_lock(&cinode->lock_mutex); |
757 | 766 | ||
758 | exist = cifs_find_lock_conflict(cinode, offset, length, type, netfid, | 767 | exist = cifs_find_lock_conflict(cinode, lock, &conf_lock); |
759 | &conf_lock); | ||
760 | if (!exist && cinode->can_cache_brlcks) { | 768 | if (!exist && cinode->can_cache_brlcks) { |
761 | list_add_tail(&lock->llist, &cinode->llist); | 769 | list_add_tail(&lock->llist, &cinode->llist); |
762 | mutex_unlock(&cinode->lock_mutex); | 770 | mutex_unlock(&cinode->lock_mutex); |
@@ -775,18 +783,21 @@ try_again: | |||
775 | (lock->blist.next == &lock->blist)); | 783 | (lock->blist.next == &lock->blist)); |
776 | if (!rc) | 784 | if (!rc) |
777 | goto try_again; | 785 | goto try_again; |
778 | else { | 786 | mutex_lock(&cinode->lock_mutex); |
779 | mutex_lock(&cinode->lock_mutex); | 787 | list_del_init(&lock->blist); |
780 | list_del_init(&lock->blist); | ||
781 | mutex_unlock(&cinode->lock_mutex); | ||
782 | } | ||
783 | } | 788 | } |
784 | 789 | ||
785 | kfree(lock); | ||
786 | mutex_unlock(&cinode->lock_mutex); | 790 | mutex_unlock(&cinode->lock_mutex); |
787 | return rc; | 791 | return rc; |
788 | } | 792 | } |
789 | 793 | ||
794 | /* | ||
795 | * Check if there is another lock that prevents us from setting the lock (posix ||
796 | * style). If such a lock exists, update the flock structure with its | ||
797 | * properties. Otherwise, set the flock type to F_UNLCK if we can cache brlocks | ||
798 | * or leave it the same if we can't. Returns 0 if we don't need to send a ||
799 | * request to the server, or 1 otherwise. ||
800 | */ | ||
790 | static int | 801 | static int |
791 | cifs_posix_lock_test(struct file *file, struct file_lock *flock) | 802 | cifs_posix_lock_test(struct file *file, struct file_lock *flock) |
792 | { | 803 | { |
@@ -794,6 +805,9 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) | |||
794 | struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); | 805 | struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); |
795 | unsigned char saved_type = flock->fl_type; | 806 | unsigned char saved_type = flock->fl_type; |
796 | 807 | ||
808 | if ((flock->fl_flags & FL_POSIX) == 0) | ||
809 | return 1; | ||
810 | |||
797 | mutex_lock(&cinode->lock_mutex); | 811 | mutex_lock(&cinode->lock_mutex); |
798 | posix_test_lock(file, flock); | 812 | posix_test_lock(file, flock); |
799 | 813 | ||
@@ -806,16 +820,25 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock) | |||
806 | return rc; | 820 | return rc; |
807 | } | 821 | } |
808 | 822 | ||
823 | /* | ||
824 | * Set the byte-range lock (posix style). Returns: | ||
825 | * 1) 0, if we set the lock and don't need to send a request to the server; ||
826 | * 2) 1, if we need to send a request to the server; ||
827 | * 3) <0, if an error occurs while setting the lock. ||
828 | */ | ||
809 | static int | 829 | static int |
810 | cifs_posix_lock_set(struct file *file, struct file_lock *flock) | 830 | cifs_posix_lock_set(struct file *file, struct file_lock *flock) |
811 | { | 831 | { |
812 | struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); | 832 | struct cifsInodeInfo *cinode = CIFS_I(file->f_path.dentry->d_inode); |
813 | int rc; | 833 | int rc = 1; |
834 | |||
835 | if ((flock->fl_flags & FL_POSIX) == 0) | ||
836 | return rc; | ||
814 | 837 | ||
815 | mutex_lock(&cinode->lock_mutex); | 838 | mutex_lock(&cinode->lock_mutex); |
816 | if (!cinode->can_cache_brlcks) { | 839 | if (!cinode->can_cache_brlcks) { |
817 | mutex_unlock(&cinode->lock_mutex); | 840 | mutex_unlock(&cinode->lock_mutex); |
818 | return 1; | 841 | return rc; |
819 | } | 842 | } |
820 | rc = posix_lock_file_wait(file, flock); | 843 | rc = posix_lock_file_wait(file, flock); |
821 | mutex_unlock(&cinode->lock_mutex); | 844 | mutex_unlock(&cinode->lock_mutex); |
@@ -928,7 +951,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) | |||
928 | else | 951 | else |
929 | type = CIFS_WRLCK; | 952 | type = CIFS_WRLCK; |
930 | 953 | ||
931 | lck = cifs_lock_init(length, flock->fl_start, type, | 954 | lck = cifs_lock_init(flock->fl_start, length, type, |
932 | cfile->netfid); | 955 | cfile->netfid); |
933 | if (!lck) { | 956 | if (!lck) { |
934 | rc = -ENOMEM; | 957 | rc = -ENOMEM; |
@@ -1065,14 +1088,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, | |||
1065 | if (rc != 0) | 1088 | if (rc != 0) |
1066 | cERROR(1, "Error %d unlocking previously locked " | 1089 | cERROR(1, "Error %d unlocking previously locked " |
1067 | "range during test of lock", rc); | 1090 | "range during test of lock", rc); |
1068 | rc = 0; | 1091 | return 0; |
1069 | return rc; | ||
1070 | } | 1092 | } |
1071 | 1093 | ||
1072 | if (type & LOCKING_ANDX_SHARED_LOCK) { | 1094 | if (type & LOCKING_ANDX_SHARED_LOCK) { |
1073 | flock->fl_type = F_WRLCK; | 1095 | flock->fl_type = F_WRLCK; |
1074 | rc = 0; | 1096 | return 0; |
1075 | return rc; | ||
1076 | } | 1097 | } |
1077 | 1098 | ||
1078 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, | 1099 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, |
@@ -1090,8 +1111,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u8 type, | |||
1090 | } else | 1111 | } else |
1091 | flock->fl_type = F_WRLCK; | 1112 | flock->fl_type = F_WRLCK; |
1092 | 1113 | ||
1093 | rc = 0; | 1114 | return 0; |
1094 | return rc; | ||
1095 | } | 1115 | } |
1096 | 1116 | ||
1097 | static void | 1117 | static void |
@@ -1249,20 +1269,26 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u8 type, | |||
1249 | } | 1269 | } |
1250 | 1270 | ||
1251 | if (lock) { | 1271 | if (lock) { |
1252 | rc = cifs_lock_add_if(cinode, flock->fl_start, length, | 1272 | struct cifsLockInfo *lock; |
1253 | type, netfid, wait_flag); | 1273 | |
1274 | lock = cifs_lock_init(flock->fl_start, length, type, netfid); | ||
1275 | if (!lock) | ||
1276 | return -ENOMEM; | ||
1277 | |||
1278 | rc = cifs_lock_add_if(cinode, lock, wait_flag); | ||
1254 | if (rc < 0) | 1279 | if (rc < 0) |
1255 | return rc; | 1280 | kfree(lock); |
1256 | else if (!rc) | 1281 | if (rc <= 0) |
1257 | goto out; | 1282 | goto out; |
1258 | 1283 | ||
1259 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, | 1284 | rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length, |
1260 | flock->fl_start, 0, 1, type, wait_flag, 0); | 1285 | flock->fl_start, 0, 1, type, wait_flag, 0); |
1261 | if (rc == 0) { | 1286 | if (rc) { |
1262 | /* For Windows locks we must store them. */ | 1287 | kfree(lock); |
1263 | rc = cifs_lock_add(cinode, length, flock->fl_start, | 1288 | goto out; |
1264 | type, netfid); | ||
1265 | } | 1289 | } |
1290 | |||
1291 | cifs_lock_add(cinode, lock); | ||
1266 | } else if (unlock) | 1292 | } else if (unlock) |
1267 | rc = cifs_unlock_range(cfile, flock, xid); | 1293 | rc = cifs_unlock_range(cfile, flock, xid); |
1268 | 1294 | ||
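
The fs/cifs/file.c hunks above move the cifsLockInfo allocation out of cifs_lock_add_if() and into the caller, which also becomes responsible for freeing it on every failure path. A minimal C sketch of the resulting ownership rules, condensed from the cifs_setlk() hunk above (error handling abbreviated; no behavior beyond what the diff shows is assumed):

        struct cifsLockInfo *lock;
        int rc;

        lock = cifs_lock_init(flock->fl_start, length, type, netfid);
        if (!lock)
                return -ENOMEM;

        rc = cifs_lock_add_if(cinode, lock, wait_flag);
        if (rc < 0)
                kfree(lock);            /* conflict or error: caller frees */
        if (rc <= 0)
                goto out;               /* 0: cached locally, list owns it */

        rc = CIFSSMBLock(xid, tcon, netfid, current->tgid, length,
                         flock->fl_start, 0, 1, type, wait_flag, 0);
        if (rc) {
                kfree(lock);            /* server rejected: caller frees */
                goto out;
        }
        cifs_lock_add(cinode, lock);    /* server granted: list now owns it */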
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5de03ec2014..a090bbe6ee2 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
@@ -554,7 +554,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, | |||
554 | rc); | 554 | rc); |
555 | return rc; | 555 | return rc; |
556 | } | 556 | } |
557 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); | 557 | /* FindFirst/Next set last_entry to NULL on malformed reply */ |
558 | if (cifsFile->srch_inf.last_entry) | ||
559 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, | ||
560 | cifsFile); | ||
558 | } | 561 | } |
559 | 562 | ||
560 | while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && | 563 | while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && |
@@ -562,7 +565,10 @@ static int find_cifs_entry(const int xid, struct cifs_tcon *pTcon, | |||
562 | cFYI(1, "calling findnext2"); | 565 | cFYI(1, "calling findnext2"); |
563 | rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, | 566 | rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, |
564 | &cifsFile->srch_inf); | 567 | &cifsFile->srch_inf); |
565 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); | 568 | /* FindFirst/Next set last_entry to NULL on malformed reply */ |
569 | if (cifsFile->srch_inf.last_entry) | ||
570 | cifs_save_resume_key(cifsFile->srch_inf.last_entry, | ||
571 | cifsFile); | ||
566 | if (rc) | 572 | if (rc) |
567 | return -ENOENT; | 573 | return -ENOENT; |
568 | } | 574 | } |
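
Both find_cifs_entry() call sites now apply the same NULL guard. As a standalone pattern (sketch only, using the srch_inf fields referenced in the hunks above):

        /* A malformed FindFirst/FindNext reply leaves last_entry NULL,
         * so only save the resume key when an entry actually came back. */
        if (cifsFile->srch_inf.last_entry)
                cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);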
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c7d80e24f24..4ec3ee9d72c 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c | |||
@@ -683,7 +683,7 @@ ssetup_ntlmssp_authenticate: | |||
683 | cpu_to_le16(CIFS_AUTH_RESP_SIZE); | 683 | cpu_to_le16(CIFS_AUTH_RESP_SIZE); |
684 | 684 | ||
685 | /* calculate ntlm response and session key */ | 685 | /* calculate ntlm response and session key */ |
686 | rc = setup_ntlm_response(ses); | 686 | rc = setup_ntlm_response(ses, nls_cp); |
687 | if (rc) { | 687 | if (rc) { |
688 | cERROR(1, "Error %d during NTLM authentication", rc); | 688 | cERROR(1, "Error %d during NTLM authentication", rc); |
689 | goto ssetup_exit; | 689 | goto ssetup_exit; |
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index ac1221d969d..80d85088193 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c | |||
@@ -199,75 +199,36 @@ SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) | |||
199 | return rc; | 199 | return rc; |
200 | } | 200 | } |
201 | 201 | ||
202 | /* Routines for Windows NT MD4 Hash functions. */ | ||
203 | static int | ||
204 | _my_wcslen(__u16 *str) | ||
205 | { | ||
206 | int len = 0; | ||
207 | while (*str++ != 0) | ||
208 | len++; | ||
209 | return len; | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * Convert a string into an NT UNICODE string. | ||
214 | * Note that regardless of processor type | ||
215 | * this must be in intel (little-endian) | ||
216 | * format. | ||
217 | */ | ||
218 | |||
219 | static int | ||
220 | _my_mbstowcs(__u16 *dst, const unsigned char *src, int len) | ||
221 | { /* BB not a very good conversion routine - change/fix */ | ||
222 | int i; | ||
223 | __u16 val; | ||
224 | |||
225 | for (i = 0; i < len; i++) { | ||
226 | val = *src; | ||
227 | SSVAL(dst, 0, val); | ||
228 | dst++; | ||
229 | src++; | ||
230 | if (val == 0) | ||
231 | break; | ||
232 | } | ||
233 | return i; | ||
234 | } | ||
235 | |||
236 | /* | 202 | /* |
237 | * Creates the MD4 Hash of the user's password in NT UNICODE. | 203 | * Creates the MD4 Hash of the user's password in NT UNICODE. |
238 | */ | 204 | */ |
239 | 205 | ||
240 | int | 206 | int |
241 | E_md4hash(const unsigned char *passwd, unsigned char *p16) | 207 | E_md4hash(const unsigned char *passwd, unsigned char *p16, |
208 | const struct nls_table *codepage) | ||
242 | { | 209 | { |
243 | int rc; | 210 | int rc; |
244 | int len; | 211 | int len; |
245 | __u16 wpwd[129]; | 212 | __le16 wpwd[129]; |
246 | 213 | ||
247 | /* Password cannot be longer than 128 characters */ | 214 | /* Password cannot be longer than 128 characters */ |
248 | if (passwd) { | 215 | if (passwd) /* Password must be converted to NT unicode */ |
249 | len = strlen((char *) passwd); | 216 | len = cifs_strtoUCS(wpwd, passwd, 128, codepage); |
250 | if (len > 128) | 217 | else { |
251 | len = 128; | ||
252 | |||
253 | /* Password must be converted to NT unicode */ | ||
254 | _my_mbstowcs(wpwd, passwd, len); | ||
255 | } else | ||
256 | len = 0; | 218 | len = 0; |
219 | *wpwd = 0; /* Ensure string is null terminated */ | ||
220 | } | ||
257 | 221 | ||
258 | wpwd[len] = 0; /* Ensure string is null terminated */ | 222 | rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); |
259 | /* Calculate length in bytes */ | 223 | memset(wpwd, 0, 129 * sizeof(__le16)); |
260 | len = _my_wcslen(wpwd) * sizeof(__u16); | ||
261 | |||
262 | rc = mdfour(p16, (unsigned char *) wpwd, len); | ||
263 | memset(wpwd, 0, 129 * 2); | ||
264 | 224 | ||
265 | return rc; | 225 | return rc; |
266 | } | 226 | } |
267 | 227 | ||
268 | /* Does the NT MD4 hash then des encryption. */ | 228 | /* Does the NT MD4 hash then des encryption. */ |
269 | int | 229 | int |
270 | SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) | 230 | SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, |
231 | const struct nls_table *codepage) | ||
271 | { | 232 | { |
272 | int rc; | 233 | int rc; |
273 | unsigned char p16[16], p21[21]; | 234 | unsigned char p16[16], p21[21]; |
@@ -275,7 +236,7 @@ SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) | |||
275 | memset(p16, '\0', 16); | 236 | memset(p16, '\0', 16); |
276 | memset(p21, '\0', 21); | 237 | memset(p21, '\0', 21); |
277 | 238 | ||
278 | rc = E_md4hash(passwd, p16); | 239 | rc = E_md4hash(passwd, p16, codepage); |
279 | if (rc) { | 240 | if (rc) { |
280 | cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); | 241 | cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); |
281 | return rc; | 242 | return rc; |
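
With the codepage argument plumbed through, E_md4hash() converts the password to little-endian UTF-16 via cifs_strtoUCS() before hashing, replacing the ASCII-only _my_mbstowcs() helper. A hedged usage sketch; "password" is a stand-in for the caller's byte string, and load_nls_default()/unload_nls() are the stock kernel NLS helpers:

        #include <linux/nls.h>

        unsigned char nt_hash[16];
        struct nls_table *nls = load_nls_default();
        int rc;

        /* The hash is computed over the UTF-16LE form of the password,
         * matching what Windows computes regardless of local charset. */
        rc = E_md4hash(password, nt_hash, nls);
        unload_nls(nls);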
diff --git a/fs/dcache.c b/fs/dcache.c index 274f13e2f09..89509b5a090 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/bit_spinlock.h> | 36 | #include <linux/bit_spinlock.h> |
37 | #include <linux/rculist_bl.h> | 37 | #include <linux/rculist_bl.h> |
38 | #include <linux/prefetch.h> | 38 | #include <linux/prefetch.h> |
39 | #include <linux/ratelimit.h> | ||
39 | #include "internal.h" | 40 | #include "internal.h" |
40 | 41 | ||
41 | /* | 42 | /* |
@@ -546,9 +547,11 @@ int d_invalidate(struct dentry * dentry) | |||
546 | * would make it unreachable from the root, | 547 | * would make it unreachable from the root, |
547 | * we might still populate it if it was a | 548 | * we might still populate it if it was a |
548 | * working directory or similar). | 549 | * working directory or similar). |
550 | * We also need to leave mountpoints alone, | ||
551 | * directory or not. | ||
549 | */ | 552 | */ |
550 | if (dentry->d_count > 1) { | 553 | if (dentry->d_count > 1 && dentry->d_inode) { |
551 | if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) { | 554 | if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { |
552 | spin_unlock(&dentry->d_lock); | 555 | spin_unlock(&dentry->d_lock); |
553 | return -EBUSY; | 556 | return -EBUSY; |
554 | } | 557 | } |
@@ -2381,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) | |||
2381 | actual = __d_unalias(inode, dentry, alias); | 2384 | actual = __d_unalias(inode, dentry, alias); |
2382 | } | 2385 | } |
2383 | write_sequnlock(&rename_lock); | 2386 | write_sequnlock(&rename_lock); |
2384 | if (IS_ERR(actual)) | 2387 | if (IS_ERR(actual)) { |
2388 | if (PTR_ERR(actual) == -ELOOP) | ||
2389 | pr_warn_ratelimited( | ||
2390 | "VFS: Lookup of '%s' in %s %s" | ||
2391 | " would have caused loop\n", | ||
2392 | dentry->d_name.name, | ||
2393 | inode->i_sb->s_type->name, | ||
2394 | inode->i_sb->s_id); | ||
2385 | dput(alias); | 2395 | dput(alias); |
2396 | } | ||
2386 | goto out_nolock; | 2397 | goto out_nolock; |
2387 | } | 2398 | } |
2388 | } | 2399 | } |
@@ -2428,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) | |||
2428 | /** | 2439 | /** |
2429 | * prepend_path - Prepend path string to a buffer | 2440 | * prepend_path - Prepend path string to a buffer |
2430 | * @path: the dentry/vfsmount to report | 2441 | * @path: the dentry/vfsmount to report |
2431 | * @root: root vfsmnt/dentry (may be modified by this function) | 2442 | * @root: root vfsmnt/dentry |
2432 | * @buffer: pointer to the end of the buffer | 2443 | * @buffer: pointer to the end of the buffer |
2433 | * @buflen: pointer to buffer length | 2444 | * @buflen: pointer to buffer length |
2434 | * | 2445 | * |
2435 | * Caller holds the rename_lock. | 2446 | * Caller holds the rename_lock. |
2436 | * | ||
2437 | * If path is not reachable from the supplied root, then the value of | ||
2438 | * root is changed (without modifying refcounts). | ||
2439 | */ | 2447 | */ |
2440 | static int prepend_path(const struct path *path, struct path *root, | 2448 | static int prepend_path(const struct path *path, |
2449 | const struct path *root, | ||
2441 | char **buffer, int *buflen) | 2450 | char **buffer, int *buflen) |
2442 | { | 2451 | { |
2443 | struct dentry *dentry = path->dentry; | 2452 | struct dentry *dentry = path->dentry; |
@@ -2472,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root, | |||
2472 | dentry = parent; | 2481 | dentry = parent; |
2473 | } | 2482 | } |
2474 | 2483 | ||
2475 | out: | ||
2476 | if (!error && !slash) | 2484 | if (!error && !slash) |
2477 | error = prepend(buffer, buflen, "/", 1); | 2485 | error = prepend(buffer, buflen, "/", 1); |
2478 | 2486 | ||
2487 | out: | ||
2479 | br_read_unlock(vfsmount_lock); | 2488 | br_read_unlock(vfsmount_lock); |
2480 | return error; | 2489 | return error; |
2481 | 2490 | ||
@@ -2489,15 +2498,17 @@ global_root: | |||
2489 | WARN(1, "Root dentry has weird name <%.*s>\n", | 2498 | WARN(1, "Root dentry has weird name <%.*s>\n", |
2490 | (int) dentry->d_name.len, dentry->d_name.name); | 2499 | (int) dentry->d_name.len, dentry->d_name.name); |
2491 | } | 2500 | } |
2492 | root->mnt = vfsmnt; | 2501 | if (!slash) |
2493 | root->dentry = dentry; | 2502 | error = prepend(buffer, buflen, "/", 1); |
2503 | if (!error) | ||
2504 | error = vfsmnt->mnt_ns ? 1 : 2; | ||
2494 | goto out; | 2505 | goto out; |
2495 | } | 2506 | } |
2496 | 2507 | ||
2497 | /** | 2508 | /** |
2498 | * __d_path - return the path of a dentry | 2509 | * __d_path - return the path of a dentry |
2499 | * @path: the dentry/vfsmount to report | 2510 | * @path: the dentry/vfsmount to report |
2500 | * @root: root vfsmnt/dentry (may be modified by this function) | 2511 | * @root: root vfsmnt/dentry |
2501 | * @buf: buffer to return value in | 2512 | * @buf: buffer to return value in |
2502 | * @buflen: buffer length | 2513 | * @buflen: buffer length |
2503 | * | 2514 | * |
@@ -2508,10 +2519,10 @@ global_root: | |||
2508 | * | 2519 | * |
2509 | * "buflen" should be positive. | 2520 | * "buflen" should be positive. |
2510 | * | 2521 | * |
2511 | * If path is not reachable from the supplied root, then the value of | 2522 | * If the path is not reachable from the supplied root, return %NULL. |
2512 | * root is changed (without modifying refcounts). | ||
2513 | */ | 2523 | */ |
2514 | char *__d_path(const struct path *path, struct path *root, | 2524 | char *__d_path(const struct path *path, |
2525 | const struct path *root, | ||
2515 | char *buf, int buflen) | 2526 | char *buf, int buflen) |
2516 | { | 2527 | { |
2517 | char *res = buf + buflen; | 2528 | char *res = buf + buflen; |
@@ -2522,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root, | |||
2522 | error = prepend_path(path, root, &res, &buflen); | 2533 | error = prepend_path(path, root, &res, &buflen); |
2523 | write_sequnlock(&rename_lock); | 2534 | write_sequnlock(&rename_lock); |
2524 | 2535 | ||
2525 | if (error) | 2536 | if (error < 0) |
2537 | return ERR_PTR(error); | ||
2538 | if (error > 0) | ||
2539 | return NULL; | ||
2540 | return res; | ||
2541 | } | ||
2542 | |||
2543 | char *d_absolute_path(const struct path *path, | ||
2544 | char *buf, int buflen) | ||
2545 | { | ||
2546 | struct path root = {}; | ||
2547 | char *res = buf + buflen; | ||
2548 | int error; | ||
2549 | |||
2550 | prepend(&res, &buflen, "\0", 1); | ||
2551 | write_seqlock(&rename_lock); | ||
2552 | error = prepend_path(path, &root, &res, &buflen); | ||
2553 | write_sequnlock(&rename_lock); | ||
2554 | |||
2555 | if (error > 1) | ||
2556 | error = -EINVAL; | ||
2557 | if (error < 0) | ||
2526 | return ERR_PTR(error); | 2558 | return ERR_PTR(error); |
2527 | return res; | 2559 | return res; |
2528 | } | 2560 | } |
@@ -2530,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root, | |||
2530 | /* | 2562 | /* |
2531 | * same as __d_path but appends "(deleted)" for unlinked files. | 2563 | * same as __d_path but appends "(deleted)" for unlinked files. |
2532 | */ | 2564 | */ |
2533 | static int path_with_deleted(const struct path *path, struct path *root, | 2565 | static int path_with_deleted(const struct path *path, |
2534 | char **buf, int *buflen) | 2566 | const struct path *root, |
2567 | char **buf, int *buflen) | ||
2535 | { | 2568 | { |
2536 | prepend(buf, buflen, "\0", 1); | 2569 | prepend(buf, buflen, "\0", 1); |
2537 | if (d_unlinked(path->dentry)) { | 2570 | if (d_unlinked(path->dentry)) { |
@@ -2568,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen) | |||
2568 | { | 2601 | { |
2569 | char *res = buf + buflen; | 2602 | char *res = buf + buflen; |
2570 | struct path root; | 2603 | struct path root; |
2571 | struct path tmp; | ||
2572 | int error; | 2604 | int error; |
2573 | 2605 | ||
2574 | /* | 2606 | /* |
@@ -2583,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen) | |||
2583 | 2615 | ||
2584 | get_fs_root(current->fs, &root); | 2616 | get_fs_root(current->fs, &root); |
2585 | write_seqlock(&rename_lock); | 2617 | write_seqlock(&rename_lock); |
2586 | tmp = root; | 2618 | error = path_with_deleted(path, &root, &res, &buflen); |
2587 | error = path_with_deleted(path, &tmp, &res, &buflen); | 2619 | if (error < 0) |
2588 | if (error) | ||
2589 | res = ERR_PTR(error); | 2620 | res = ERR_PTR(error); |
2590 | write_sequnlock(&rename_lock); | 2621 | write_sequnlock(&rename_lock); |
2591 | path_put(&root); | 2622 | path_put(&root); |
@@ -2606,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) | |||
2606 | { | 2637 | { |
2607 | char *res = buf + buflen; | 2638 | char *res = buf + buflen; |
2608 | struct path root; | 2639 | struct path root; |
2609 | struct path tmp; | ||
2610 | int error; | 2640 | int error; |
2611 | 2641 | ||
2612 | if (path->dentry->d_op && path->dentry->d_op->d_dname) | 2642 | if (path->dentry->d_op && path->dentry->d_op->d_dname) |
@@ -2614,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) | |||
2614 | 2644 | ||
2615 | get_fs_root(current->fs, &root); | 2645 | get_fs_root(current->fs, &root); |
2616 | write_seqlock(&rename_lock); | 2646 | write_seqlock(&rename_lock); |
2617 | tmp = root; | 2647 | error = path_with_deleted(path, &root, &res, &buflen); |
2618 | error = path_with_deleted(path, &tmp, &res, &buflen); | 2648 | if (error > 0) |
2619 | if (!error && !path_equal(&tmp, &root)) | ||
2620 | error = prepend_unreachable(&res, &buflen); | 2649 | error = prepend_unreachable(&res, &buflen); |
2621 | write_sequnlock(&rename_lock); | 2650 | write_sequnlock(&rename_lock); |
2622 | path_put(&root); | 2651 | path_put(&root); |
@@ -2747,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) | |||
2747 | write_seqlock(&rename_lock); | 2776 | write_seqlock(&rename_lock); |
2748 | if (!d_unlinked(pwd.dentry)) { | 2777 | if (!d_unlinked(pwd.dentry)) { |
2749 | unsigned long len; | 2778 | unsigned long len; |
2750 | struct path tmp = root; | ||
2751 | char *cwd = page + PAGE_SIZE; | 2779 | char *cwd = page + PAGE_SIZE; |
2752 | int buflen = PAGE_SIZE; | 2780 | int buflen = PAGE_SIZE; |
2753 | 2781 | ||
2754 | prepend(&cwd, &buflen, "\0", 1); | 2782 | prepend(&cwd, &buflen, "\0", 1); |
2755 | error = prepend_path(&pwd, &tmp, &cwd, &buflen); | 2783 | error = prepend_path(&pwd, &root, &cwd, &buflen); |
2756 | write_sequnlock(&rename_lock); | 2784 | write_sequnlock(&rename_lock); |
2757 | 2785 | ||
2758 | if (error) | 2786 | if (error < 0) |
2759 | goto out; | 2787 | goto out; |
2760 | 2788 | ||
2761 | /* Unreachable from current root */ | 2789 | /* Unreachable from current root */ |
2762 | if (!path_equal(&tmp, &root)) { | 2790 | if (error > 0) { |
2763 | error = prepend_unreachable(&cwd, &buflen); | 2791 | error = prepend_unreachable(&cwd, &buflen); |
2764 | if (error) | 2792 | if (error) |
2765 | goto out; | 2793 | goto out; |
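
The prepend_path() rework changes the calling convention for its users: the root argument is now const and left untouched, and unreachability is reported through the return value instead. A caller sketch under the new rules (path, root, buf, and buflen are assumed locals):

        char *p;

        p = __d_path(path, &root, buf, buflen);
        if (IS_ERR(p))
                return PTR_ERR(p);      /* real error, e.g. buffer too small */
        if (!p)
                p = "?";                /* not reachable from @root,
                                         * e.g. outside a chroot */

        /* d_absolute_path() instead returns ERR_PTR(-EINVAL) when the
         * path is not attached to any namespace at all. */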
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 58609bde3b9..2a834255c75 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c | |||
@@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals( | |||
967 | 967 | ||
968 | /** | 968 | /** |
969 | * ecryptfs_new_file_context | 969 | * ecryptfs_new_file_context |
970 | * @ecryptfs_dentry: The eCryptfs dentry | 970 | * @ecryptfs_inode: The eCryptfs inode |
971 | * | 971 | * |
972 | * If the crypto context for the file has not yet been established, | 972 | * If the crypto context for the file has not yet been established, |
973 | * this is where we do that. Establishing a new crypto context | 973 | * this is where we do that. Establishing a new crypto context |
@@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals( | |||
984 | * | 984 | * |
985 | * Returns zero on success; non-zero otherwise | 985 | * Returns zero on success; non-zero otherwise |
986 | */ | 986 | */ |
987 | int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) | 987 | int ecryptfs_new_file_context(struct inode *ecryptfs_inode) |
988 | { | 988 | { |
989 | struct ecryptfs_crypt_stat *crypt_stat = | 989 | struct ecryptfs_crypt_stat *crypt_stat = |
990 | &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; | 990 | &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; |
991 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = | 991 | struct ecryptfs_mount_crypt_stat *mount_crypt_stat = |
992 | &ecryptfs_superblock_to_private( | 992 | &ecryptfs_superblock_to_private( |
993 | ecryptfs_dentry->d_sb)->mount_crypt_stat; | 993 | ecryptfs_inode->i_sb)->mount_crypt_stat; |
994 | int cipher_name_len; | 994 | int cipher_name_len; |
995 | int rc = 0; | 995 | int rc = 0; |
996 | 996 | ||
@@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, | |||
1299 | } | 1299 | } |
1300 | 1300 | ||
1301 | static int | 1301 | static int |
1302 | ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, | 1302 | ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, |
1303 | char *virt, size_t virt_len) | 1303 | char *virt, size_t virt_len) |
1304 | { | 1304 | { |
1305 | int rc; | 1305 | int rc; |
1306 | 1306 | ||
1307 | rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, | 1307 | rc = ecryptfs_write_lower(ecryptfs_inode, virt, |
1308 | 0, virt_len); | 1308 | 0, virt_len); |
1309 | if (rc < 0) | 1309 | if (rc < 0) |
1310 | printk(KERN_ERR "%s: Error attempting to write header " | 1310 | printk(KERN_ERR "%s: Error attempting to write header " |
@@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, | |||
1338 | 1338 | ||
1339 | /** | 1339 | /** |
1340 | * ecryptfs_write_metadata | 1340 | * ecryptfs_write_metadata |
1341 | * @ecryptfs_dentry: The eCryptfs dentry | 1341 | * @ecryptfs_dentry: The eCryptfs dentry, which should be negative |
1342 | * @ecryptfs_inode: The newly created eCryptfs inode | ||
1342 | * | 1343 | * |
1343 | * Write the file headers out. This will likely involve a userspace | 1344 | * Write the file headers out. This will likely involve a userspace |
1344 | * callout, in which the session key is encrypted with one or more | 1345 | * callout, in which the session key is encrypted with one or more |
@@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, | |||
1348 | * | 1349 | * |
1349 | * Returns zero on success; non-zero on error | 1350 | * Returns zero on success; non-zero on error |
1350 | */ | 1351 | */ |
1351 | int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) | 1352 | int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, |
1353 | struct inode *ecryptfs_inode) | ||
1352 | { | 1354 | { |
1353 | struct ecryptfs_crypt_stat *crypt_stat = | 1355 | struct ecryptfs_crypt_stat *crypt_stat = |
1354 | &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; | 1356 | &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; |
1355 | unsigned int order; | 1357 | unsigned int order; |
1356 | char *virt; | 1358 | char *virt; |
1357 | size_t virt_len; | 1359 | size_t virt_len; |
@@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) | |||
1391 | rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, | 1393 | rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, |
1392 | size); | 1394 | size); |
1393 | else | 1395 | else |
1394 | rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, | 1396 | rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, |
1395 | virt_len); | 1397 | virt_len); |
1396 | if (rc) { | 1398 | if (rc) { |
1397 | printk(KERN_ERR "%s: Error writing metadata out to lower file; " | 1399 | printk(KERN_ERR "%s: Error writing metadata out to lower file; " |
@@ -1943,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD" | |||
1943 | 1945 | ||
1944 | /* We could either offset on every reverse map or just pad some 0x00's | 1946 | /* We could either offset on every reverse map or just pad some 0x00's |
1945 | * at the front here */ | 1947 | * at the front here */ |
1946 | static const unsigned char filename_rev_map[] = { | 1948 | static const unsigned char filename_rev_map[256] = { |
1947 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ | 1949 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ |
1948 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ | 1950 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ |
1949 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ | 1951 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ |
@@ -1959,7 +1961,7 @@ static const unsigned char filename_rev_map[] = { | |||
1959 | 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ | 1961 | 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ |
1960 | 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ | 1962 | 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ |
1961 | 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ | 1963 | 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ |
1962 | 0x3D, 0x3E, 0x3F | 1964 | 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ |
1963 | }; | 1965 | }; |
1964 | 1966 | ||
1965 | /** | 1967 | /** |
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 54481a3b2c7..a9f29b12fbf 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h | |||
@@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); | |||
584 | int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); | 584 | int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); |
585 | int ecryptfs_encrypt_page(struct page *page); | 585 | int ecryptfs_encrypt_page(struct page *page); |
586 | int ecryptfs_decrypt_page(struct page *page); | 586 | int ecryptfs_decrypt_page(struct page *page); |
587 | int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); | 587 | int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, |
588 | struct inode *ecryptfs_inode); | ||
588 | int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); | 589 | int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); |
589 | int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); | 590 | int ecryptfs_new_file_context(struct inode *ecryptfs_inode); |
590 | void ecryptfs_write_crypt_stat_flags(char *page_virt, | 591 | void ecryptfs_write_crypt_stat_flags(char *page_virt, |
591 | struct ecryptfs_crypt_stat *crypt_stat, | 592 | struct ecryptfs_crypt_stat *crypt_stat, |
592 | size_t *written); | 593 | size_t *written); |
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c6ac98cf9ba..d3f95f941c4 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c | |||
@@ -139,6 +139,27 @@ out: | |||
139 | return rc; | 139 | return rc; |
140 | } | 140 | } |
141 | 141 | ||
142 | static void ecryptfs_vma_close(struct vm_area_struct *vma) | ||
143 | { | ||
144 | filemap_write_and_wait(vma->vm_file->f_mapping); | ||
145 | } | ||
146 | |||
147 | static const struct vm_operations_struct ecryptfs_file_vm_ops = { | ||
148 | .close = ecryptfs_vma_close, | ||
149 | .fault = filemap_fault, | ||
150 | }; | ||
151 | |||
152 | static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) | ||
153 | { | ||
154 | int rc; | ||
155 | |||
156 | rc = generic_file_mmap(file, vma); | ||
157 | if (!rc) | ||
158 | vma->vm_ops = &ecryptfs_file_vm_ops; | ||
159 | |||
160 | return rc; | ||
161 | } | ||
162 | |||
142 | struct kmem_cache *ecryptfs_file_info_cache; | 163 | struct kmem_cache *ecryptfs_file_info_cache; |
143 | 164 | ||
144 | /** | 165 | /** |
@@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = { | |||
349 | #ifdef CONFIG_COMPAT | 370 | #ifdef CONFIG_COMPAT |
350 | .compat_ioctl = ecryptfs_compat_ioctl, | 371 | .compat_ioctl = ecryptfs_compat_ioctl, |
351 | #endif | 372 | #endif |
352 | .mmap = generic_file_mmap, | 373 | .mmap = ecryptfs_file_mmap, |
353 | .open = ecryptfs_open, | 374 | .open = ecryptfs_open, |
354 | .flush = ecryptfs_flush, | 375 | .flush = ecryptfs_flush, |
355 | .release = ecryptfs_release, | 376 | .release = ecryptfs_release, |
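
The mmap wrapper above adds a close hook so dirty mmapped pages are flushed to the lower file at munmap time, while keeping the generic fault path. The general shape of the pattern (names here are illustrative, not taken from the diff):

        static const struct vm_operations_struct my_vm_ops = {
                .close = my_vma_close,  /* hypothetical flush-on-unmap hook */
                .fault = filemap_fault, /* keep this: generic_file_mmap()
                                         * would otherwise provide it */
        };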
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a36d327f152..32f90a3ae63 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c | |||
@@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, | |||
172 | * it. It will also update the eCryptfs directory inode to mimic the | 172 | * it. It will also update the eCryptfs directory inode to mimic the |
173 | * stat of the lower directory inode. | 173 | * stat of the lower directory inode. |
174 | * | 174 | * |
175 | * Returns zero on success; non-zero on error condition | 175 | * Returns the new eCryptfs inode on success; an ERR_PTR on error condition |
176 | */ | 176 | */ |
177 | static int | 177 | static struct inode * |
178 | ecryptfs_do_create(struct inode *directory_inode, | 178 | ecryptfs_do_create(struct inode *directory_inode, |
179 | struct dentry *ecryptfs_dentry, int mode) | 179 | struct dentry *ecryptfs_dentry, int mode) |
180 | { | 180 | { |
181 | int rc; | 181 | int rc; |
182 | struct dentry *lower_dentry; | 182 | struct dentry *lower_dentry; |
183 | struct dentry *lower_dir_dentry; | 183 | struct dentry *lower_dir_dentry; |
184 | struct inode *inode; | ||
184 | 185 | ||
185 | lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); | 186 | lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); |
186 | lower_dir_dentry = lock_parent(lower_dentry); | 187 | lower_dir_dentry = lock_parent(lower_dentry); |
187 | if (IS_ERR(lower_dir_dentry)) { | 188 | if (IS_ERR(lower_dir_dentry)) { |
188 | ecryptfs_printk(KERN_ERR, "Error locking directory of " | 189 | ecryptfs_printk(KERN_ERR, "Error locking directory of " |
189 | "dentry\n"); | 190 | "dentry\n"); |
190 | rc = PTR_ERR(lower_dir_dentry); | 191 | inode = ERR_CAST(lower_dir_dentry); |
191 | goto out; | 192 | goto out; |
192 | } | 193 | } |
193 | rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, | 194 | rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, |
@@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode, | |||
195 | if (rc) { | 196 | if (rc) { |
196 | printk(KERN_ERR "%s: Failure to create dentry in lower fs; " | 197 | printk(KERN_ERR "%s: Failure to create dentry in lower fs; " |
197 | "rc = [%d]\n", __func__, rc); | 198 | "rc = [%d]\n", __func__, rc); |
199 | inode = ERR_PTR(rc); | ||
198 | goto out_lock; | 200 | goto out_lock; |
199 | } | 201 | } |
200 | rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, | 202 | inode = __ecryptfs_get_inode(lower_dentry->d_inode, |
201 | directory_inode->i_sb); | 203 | directory_inode->i_sb); |
202 | if (rc) { | 204 | if (IS_ERR(inode)) |
203 | ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); | ||
204 | goto out_lock; | 205 | goto out_lock; |
205 | } | ||
206 | fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); | 206 | fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); |
207 | fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); | 207 | fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); |
208 | out_lock: | 208 | out_lock: |
209 | unlock_dir(lower_dir_dentry); | 209 | unlock_dir(lower_dir_dentry); |
210 | out: | 210 | out: |
211 | return rc; | 211 | return inode; |
212 | } | 212 | } |
213 | 213 | ||
214 | /** | 214 | /** |
@@ -219,26 +219,26 @@ out: | |||
219 | * | 219 | * |
220 | * Returns zero on success | 220 | * Returns zero on success |
221 | */ | 221 | */ |
222 | static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) | 222 | static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, |
223 | struct inode *ecryptfs_inode) | ||
223 | { | 224 | { |
224 | struct ecryptfs_crypt_stat *crypt_stat = | 225 | struct ecryptfs_crypt_stat *crypt_stat = |
225 | &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; | 226 | &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; |
226 | int rc = 0; | 227 | int rc = 0; |
227 | 228 | ||
228 | if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { | 229 | if (S_ISDIR(ecryptfs_inode->i_mode)) { |
229 | ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); | 230 | ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); |
230 | crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); | 231 | crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); |
231 | goto out; | 232 | goto out; |
232 | } | 233 | } |
233 | ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); | 234 | ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); |
234 | rc = ecryptfs_new_file_context(ecryptfs_dentry); | 235 | rc = ecryptfs_new_file_context(ecryptfs_inode); |
235 | if (rc) { | 236 | if (rc) { |
236 | ecryptfs_printk(KERN_ERR, "Error creating new file " | 237 | ecryptfs_printk(KERN_ERR, "Error creating new file " |
237 | "context; rc = [%d]\n", rc); | 238 | "context; rc = [%d]\n", rc); |
238 | goto out; | 239 | goto out; |
239 | } | 240 | } |
240 | rc = ecryptfs_get_lower_file(ecryptfs_dentry, | 241 | rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode); |
241 | ecryptfs_dentry->d_inode); | ||
242 | if (rc) { | 242 | if (rc) { |
243 | printk(KERN_ERR "%s: Error attempting to initialize " | 243 | printk(KERN_ERR "%s: Error attempting to initialize " |
244 | "the lower file for the dentry with name " | 244 | "the lower file for the dentry with name " |
@@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) | |||
246 | ecryptfs_dentry->d_name.name, rc); | 246 | ecryptfs_dentry->d_name.name, rc); |
247 | goto out; | 247 | goto out; |
248 | } | 248 | } |
249 | rc = ecryptfs_write_metadata(ecryptfs_dentry); | 249 | rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); |
250 | if (rc) | 250 | if (rc) |
251 | printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); | 251 | printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); |
252 | ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); | 252 | ecryptfs_put_lower_file(ecryptfs_inode); |
253 | out: | 253 | out: |
254 | return rc; | 254 | return rc; |
255 | } | 255 | } |
@@ -269,18 +269,28 @@ static int | |||
269 | ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, | 269 | ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, |
270 | int mode, struct nameidata *nd) | 270 | int mode, struct nameidata *nd) |
271 | { | 271 | { |
272 | struct inode *ecryptfs_inode; | ||
272 | int rc; | 273 | int rc; |
273 | 274 | ||
274 | /* ecryptfs_do_create() calls ecryptfs_interpose() */ | 275 | ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, |
275 | rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); | 276 | mode); |
276 | if (unlikely(rc)) { | 277 | if (unlikely(IS_ERR(ecryptfs_inode))) { |
277 | ecryptfs_printk(KERN_WARNING, "Failed to create file in" | 278 | ecryptfs_printk(KERN_WARNING, "Failed to create file in" |
278 | "lower filesystem\n"); | 279 | "lower filesystem\n"); |
280 | rc = PTR_ERR(ecryptfs_inode); | ||
279 | goto out; | 281 | goto out; |
280 | } | 282 | } |
281 | /* At this point, a file exists on "disk"; we need to make sure | 283 | /* At this point, a file exists on "disk"; we need to make sure |
282 | * that this on disk file is prepared to be an ecryptfs file */ | 284 | * that this on disk file is prepared to be an ecryptfs file */ |
283 | rc = ecryptfs_initialize_file(ecryptfs_dentry); | 285 | rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); |
286 | if (rc) { | ||
287 | drop_nlink(ecryptfs_inode); | ||
288 | unlock_new_inode(ecryptfs_inode); | ||
289 | iput(ecryptfs_inode); | ||
290 | goto out; | ||
291 | } | ||
292 | d_instantiate(ecryptfs_dentry, ecryptfs_inode); | ||
293 | unlock_new_inode(ecryptfs_inode); | ||
284 | out: | 294 | out: |
285 | return rc; | 295 | return rc; |
286 | } | 296 | } |
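
Returning the inode from ecryptfs_do_create() instead of interposing inside it lets ecryptfs_create() delay d_instantiate() until the file is fully set up, and unwind cleanly when header writing fails. The sequence, condensed from the hunks above (dir/dentry stand in for directory_inode/ecryptfs_dentry; a restatement, not new behavior):

        inode = ecryptfs_do_create(dir, dentry, mode); /* lower file + inode */
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        rc = ecryptfs_initialize_file(dentry, inode);  /* crypto ctx + headers */
        if (rc) {
                drop_nlink(inode);
                unlock_new_inode(inode);
                iput(inode);            /* inode dies before becoming visible */
                return rc;
        }
        d_instantiate(dentry, inode);   /* only now visible via the dentry */
        unlock_new_inode(inode);
        return 0;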
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index fa9a286c877..da42f32c49b 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig | |||
@@ -5,7 +5,7 @@ | |||
5 | # selected by any of the users. | 5 | # selected by any of the users. |
6 | config ORE | 6 | config ORE |
7 | tristate | 7 | tristate |
8 | depends on EXOFS_FS | 8 | depends on EXOFS_FS || PNFS_OBJLAYOUT |
9 | select ASYNC_XOR | 9 | select ASYNC_XOR |
10 | default SCSI_OSD_ULD | 10 | default SCSI_OSD_ULD |
11 | 11 | ||
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index fcfa86ae6fa..d271ad83720 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c | |||
@@ -23,6 +23,7 @@ | |||
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/module.h> | ||
26 | #include <asm/div64.h> | 27 | #include <asm/div64.h> |
27 | #include <linux/lcm.h> | 28 | #include <linux/lcm.h> |
28 | 29 | ||
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 057b237b8b6..e6085ec192d 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/parser.h> | 35 | #include <linux/parser.h> |
36 | #include <linux/vfs.h> | 36 | #include <linux/vfs.h> |
37 | #include <linux/random.h> | 37 | #include <linux/random.h> |
38 | #include <linux/module.h> | ||
38 | #include <linux/exportfs.h> | 39 | #include <linux/exportfs.h> |
39 | #include <linux/slab.h> | 40 | #include <linux/slab.h> |
40 | 41 | ||
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f6dba4505f1..12ccacda44e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) | |||
565 | brelse(bitmap_bh); | 565 | brelse(bitmap_bh); |
566 | printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" | 566 | printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" |
567 | ", computed = %llu, %llu\n", | 567 | ", computed = %llu, %llu\n", |
568 | EXT4_B2C(sbi, ext4_free_blocks_count(es)), | 568 | EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), |
569 | desc_count, bitmap_count); | 569 | desc_count, bitmap_count); |
570 | return bitmap_count; | 570 | return bitmap_count; |
571 | #else | 571 | #else |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cc5a6da030a..848f436df29 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -2270,6 +2270,7 @@ retry: | |||
2270 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " | 2270 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
2271 | "%ld pages, ino %lu; err %d", __func__, | 2271 | "%ld pages, ino %lu; err %d", __func__, |
2272 | wbc->nr_to_write, inode->i_ino, ret); | 2272 | wbc->nr_to_write, inode->i_ino, ret); |
2273 | blk_finish_plug(&plug); | ||
2273 | goto out_writepages; | 2274 | goto out_writepages; |
2274 | } | 2275 | } |
2275 | 2276 | ||
@@ -2372,7 +2373,7 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2372 | * start pushing delalloc when 1/2 of free blocks are dirty. | 2373 | * start pushing delalloc when 1/2 of free blocks are dirty. |
2373 | */ | 2374 | */ |
2374 | if (free_blocks < 2 * dirty_blocks) | 2375 | if (free_blocks < 2 * dirty_blocks) |
2375 | writeback_inodes_sb_if_idle(sb); | 2376 | writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); |
2376 | 2377 | ||
2377 | return 0; | 2378 | return 0; |
2378 | } | 2379 | } |
@@ -2806,8 +2807,8 @@ out: | |||
2806 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 2807 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
2807 | 2808 | ||
2808 | /* queue the work to convert unwritten extents to written */ | 2809 | /* queue the work to convert unwritten extents to written */ |
2809 | queue_work(wq, &io_end->work); | ||
2810 | iocb->private = NULL; | 2810 | iocb->private = NULL; |
2811 | queue_work(wq, &io_end->work); | ||
2811 | 2812 | ||
2812 | /* XXX: probably should move into the real I/O completion handler */ | 2813 | /* XXX: probably should move into the real I/O completion handler */ |
2813 | inode_dio_done(inode); | 2814 | inode_dio_done(inode); |
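
The one-line reorder in fs/ext4/inode.c matters because queue_work() may run the completion worker immediately on another CPU. A comment-sketch of the race being closed (an assumed interleaving, not taken from a trace):

        /* Before:
         *      queue_work(wq, &io_end->work);  worker runs, completes the AIO,
         *      iocb->private = NULL;           and the iocb may already be gone
         *                                      when this store executes.
         * After: clear iocb->private first, then hand io_end to the worker.
         */
        iocb->private = NULL;
        queue_work(wq, &io_end->work);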
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9953d80145a..3858767ec67 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -1683,7 +1683,9 @@ static int parse_options(char *options, struct super_block *sb, | |||
1683 | data_opt = EXT4_MOUNT_WRITEBACK_DATA; | 1683 | data_opt = EXT4_MOUNT_WRITEBACK_DATA; |
1684 | datacheck: | 1684 | datacheck: |
1685 | if (is_remount) { | 1685 | if (is_remount) { |
1686 | if (test_opt(sb, DATA_FLAGS) != data_opt) { | 1686 | if (!sbi->s_journal) |
1687 | ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); | ||
1688 | else if (test_opt(sb, DATA_FLAGS) != data_opt) { | ||
1687 | ext4_msg(sb, KERN_ERR, | 1689 | ext4_msg(sb, KERN_ERR, |
1688 | "Cannot change data mode on remount"); | 1690 | "Cannot change data mode on remount"); |
1689 | return 0; | 1691 | return 0; |
@@ -3099,8 +3101,6 @@ static void ext4_destroy_lazyinit_thread(void) | |||
3099 | } | 3101 | } |
3100 | 3102 | ||
3101 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) | 3103 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
3102 | __releases(kernel_lock) | ||
3103 | __acquires(kernel_lock) | ||
3104 | { | 3104 | { |
3105 | char *orig_data = kstrdup(data, GFP_KERNEL); | 3105 | char *orig_data = kstrdup(data, GFP_KERNEL); |
3106 | struct buffer_head *bh; | 3106 | struct buffer_head *bh; |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 04cf3b91e50..ac86f8b3e3c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -41,11 +41,23 @@ struct wb_writeback_work { | |||
41 | unsigned int for_kupdate:1; | 41 | unsigned int for_kupdate:1; |
42 | unsigned int range_cyclic:1; | 42 | unsigned int range_cyclic:1; |
43 | unsigned int for_background:1; | 43 | unsigned int for_background:1; |
44 | enum wb_reason reason; /* why was writeback initiated? */ | ||
44 | 45 | ||
45 | struct list_head list; /* pending work list */ | 46 | struct list_head list; /* pending work list */ |
46 | struct completion *done; /* set if the caller waits */ | 47 | struct completion *done; /* set if the caller waits */ |
47 | }; | 48 | }; |
48 | 49 | ||
50 | const char *wb_reason_name[] = { | ||
51 | [WB_REASON_BACKGROUND] = "background", | ||
52 | [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", | ||
53 | [WB_REASON_SYNC] = "sync", | ||
54 | [WB_REASON_PERIODIC] = "periodic", | ||
55 | [WB_REASON_LAPTOP_TIMER] = "laptop_timer", | ||
56 | [WB_REASON_FREE_MORE_MEM] = "free_more_memory", | ||
57 | [WB_REASON_FS_FREE_SPACE] = "fs_free_space", | ||
58 | [WB_REASON_FORKER_THREAD] = "forker_thread" | ||
59 | }; | ||
60 | |||
49 | /* | 61 | /* |
50 | * Include the creation of the trace points after defining the | 62 | * Include the creation of the trace points after defining the |
51 | * wb_writeback_work structure so that the definition remains local to this | 63 | * wb_writeback_work structure so that the definition remains local to this |
@@ -115,7 +127,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, | |||
115 | 127 | ||
116 | static void | 128 | static void |
117 | __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, | 129 | __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, |
118 | bool range_cyclic) | 130 | bool range_cyclic, enum wb_reason reason) |
119 | { | 131 | { |
120 | struct wb_writeback_work *work; | 132 | struct wb_writeback_work *work; |
121 | 133 | ||
@@ -135,6 +147,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, | |||
135 | work->sync_mode = WB_SYNC_NONE; | 147 | work->sync_mode = WB_SYNC_NONE; |
136 | work->nr_pages = nr_pages; | 148 | work->nr_pages = nr_pages; |
137 | work->range_cyclic = range_cyclic; | 149 | work->range_cyclic = range_cyclic; |
150 | work->reason = reason; | ||
138 | 151 | ||
139 | bdi_queue_work(bdi, work); | 152 | bdi_queue_work(bdi, work); |
140 | } | 153 | } |
@@ -143,6 +156,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, | |||
143 | * bdi_start_writeback - start writeback | 156 | * bdi_start_writeback - start writeback |
144 | * @bdi: the backing device to write from | 157 | * @bdi: the backing device to write from |
145 | * @nr_pages: the number of pages to write | 158 | * @nr_pages: the number of pages to write |
159 | * @reason: reason why some writeback work was initiated | ||
146 | * | 160 | * |
147 | * Description: | 161 | * Description: |
148 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only | 162 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only |
@@ -150,9 +164,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, | |||
150 | * completion. Caller need not hold sb s_umount semaphore. | 164 | * completion. Caller need not hold sb s_umount semaphore. |
151 | * | 165 | * |
152 | */ | 166 | */ |
153 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) | 167 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, |
168 | enum wb_reason reason) | ||
154 | { | 169 | { |
155 | __bdi_start_writeback(bdi, nr_pages, true); | 170 | __bdi_start_writeback(bdi, nr_pages, true, reason); |
156 | } | 171 | } |
157 | 172 | ||
158 | /** | 173 | /** |
@@ -251,7 +266,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) | |||
251 | */ | 266 | */ |
252 | static int move_expired_inodes(struct list_head *delaying_queue, | 267 | static int move_expired_inodes(struct list_head *delaying_queue, |
253 | struct list_head *dispatch_queue, | 268 | struct list_head *dispatch_queue, |
254 | unsigned long *older_than_this) | 269 | struct wb_writeback_work *work) |
255 | { | 270 | { |
256 | LIST_HEAD(tmp); | 271 | LIST_HEAD(tmp); |
257 | struct list_head *pos, *node; | 272 | struct list_head *pos, *node; |
@@ -262,8 +277,8 @@ static int move_expired_inodes(struct list_head *delaying_queue, | |||
262 | 277 | ||
263 | while (!list_empty(delaying_queue)) { | 278 | while (!list_empty(delaying_queue)) { |
264 | inode = wb_inode(delaying_queue->prev); | 279 | inode = wb_inode(delaying_queue->prev); |
265 | if (older_than_this && | 280 | if (work->older_than_this && |
266 | inode_dirtied_after(inode, *older_than_this)) | 281 | inode_dirtied_after(inode, *work->older_than_this)) |
267 | break; | 282 | break; |
268 | if (sb && sb != inode->i_sb) | 283 | if (sb && sb != inode->i_sb) |
269 | do_sb_sort = 1; | 284 | do_sb_sort = 1; |
@@ -302,13 +317,13 @@ out: | |||
302 | * | | 317 | * | |
303 | * +--> dequeue for IO | 318 | * +--> dequeue for IO |
304 | */ | 319 | */ |
305 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) | 320 | static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) |
306 | { | 321 | { |
307 | int moved; | 322 | int moved; |
308 | assert_spin_locked(&wb->list_lock); | 323 | assert_spin_locked(&wb->list_lock); |
309 | list_splice_init(&wb->b_more_io, &wb->b_io); | 324 | list_splice_init(&wb->b_more_io, &wb->b_io); |
310 | moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); | 325 | moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); |
311 | trace_writeback_queue_io(wb, older_than_this, moved); | 326 | trace_writeback_queue_io(wb, work, moved); |
312 | } | 327 | } |
313 | 328 | ||
314 | static int write_inode(struct inode *inode, struct writeback_control *wbc) | 329 | static int write_inode(struct inode *inode, struct writeback_control *wbc) |
@@ -641,31 +656,40 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, | |||
641 | return wrote; | 656 | return wrote; |
642 | } | 657 | } |
643 | 658 | ||
644 | long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages) | 659 | long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, |
660 | enum wb_reason reason) | ||
645 | { | 661 | { |
646 | struct wb_writeback_work work = { | 662 | struct wb_writeback_work work = { |
647 | .nr_pages = nr_pages, | 663 | .nr_pages = nr_pages, |
648 | .sync_mode = WB_SYNC_NONE, | 664 | .sync_mode = WB_SYNC_NONE, |
649 | .range_cyclic = 1, | 665 | .range_cyclic = 1, |
666 | .reason = reason, | ||
650 | }; | 667 | }; |
651 | 668 | ||
652 | spin_lock(&wb->list_lock); | 669 | spin_lock(&wb->list_lock); |
653 | if (list_empty(&wb->b_io)) | 670 | if (list_empty(&wb->b_io)) |
654 | queue_io(wb, NULL); | 671 | queue_io(wb, &work); |
655 | __writeback_inodes_wb(wb, &work); | 672 | __writeback_inodes_wb(wb, &work); |
656 | spin_unlock(&wb->list_lock); | 673 | spin_unlock(&wb->list_lock); |
657 | 674 | ||
658 | return nr_pages - work.nr_pages; | 675 | return nr_pages - work.nr_pages; |
659 | } | 676 | } |
660 | 677 | ||
661 | static inline bool over_bground_thresh(void) | 678 | static bool over_bground_thresh(struct backing_dev_info *bdi) |
662 | { | 679 | { |
663 | unsigned long background_thresh, dirty_thresh; | 680 | unsigned long background_thresh, dirty_thresh; |
664 | 681 | ||
665 | global_dirty_limits(&background_thresh, &dirty_thresh); | 682 | global_dirty_limits(&background_thresh, &dirty_thresh); |
666 | 683 | ||
667 | return (global_page_state(NR_FILE_DIRTY) + | 684 | if (global_page_state(NR_FILE_DIRTY) + |
668 | global_page_state(NR_UNSTABLE_NFS) > background_thresh); | 685 | global_page_state(NR_UNSTABLE_NFS) > background_thresh) |
686 | return true; | ||
687 | |||
688 | if (bdi_stat(bdi, BDI_RECLAIMABLE) > | ||
689 | bdi_dirty_limit(bdi, background_thresh)) | ||
690 | return true; | ||
691 | |||
692 | return false; | ||
669 | } | 693 | } |
670 | 694 | ||
671 | /* | 695 | /* |
@@ -675,7 +699,7 @@ static inline bool over_bground_thresh(void) | |||
675 | static void wb_update_bandwidth(struct bdi_writeback *wb, | 699 | static void wb_update_bandwidth(struct bdi_writeback *wb, |
676 | unsigned long start_time) | 700 | unsigned long start_time) |
677 | { | 701 | { |
678 | __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time); | 702 | __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); |
679 | } | 703 | } |
680 | 704 | ||
681 | /* | 705 | /* |
@@ -727,7 +751,7 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
727 | * For background writeout, stop when we are below the | 751 | * For background writeout, stop when we are below the |
728 | * background dirty threshold | 752 | * background dirty threshold |
729 | */ | 753 | */ |
730 | if (work->for_background && !over_bground_thresh()) | 754 | if (work->for_background && !over_bground_thresh(wb->bdi)) |
731 | break; | 755 | break; |
732 | 756 | ||
733 | if (work->for_kupdate) { | 757 | if (work->for_kupdate) { |
@@ -738,7 +762,7 @@ static long wb_writeback(struct bdi_writeback *wb, | |||
738 | 762 | ||
739 | trace_writeback_start(wb->bdi, work); | 763 | trace_writeback_start(wb->bdi, work); |
740 | if (list_empty(&wb->b_io)) | 764 | if (list_empty(&wb->b_io)) |
741 | queue_io(wb, work->older_than_this); | 765 | queue_io(wb, work); |
742 | if (work->sb) | 766 | if (work->sb) |
743 | progress = writeback_sb_inodes(work->sb, wb, work); | 767 | progress = writeback_sb_inodes(work->sb, wb, work); |
744 | else | 768 | else |
@@ -811,13 +835,14 @@ static unsigned long get_nr_dirty_pages(void) | |||
811 | 835 | ||
812 | static long wb_check_background_flush(struct bdi_writeback *wb) | 836 | static long wb_check_background_flush(struct bdi_writeback *wb) |
813 | { | 837 | { |
814 | if (over_bground_thresh()) { | 838 | if (over_bground_thresh(wb->bdi)) { |
815 | 839 | ||
816 | struct wb_writeback_work work = { | 840 | struct wb_writeback_work work = { |
817 | .nr_pages = LONG_MAX, | 841 | .nr_pages = LONG_MAX, |
818 | .sync_mode = WB_SYNC_NONE, | 842 | .sync_mode = WB_SYNC_NONE, |
819 | .for_background = 1, | 843 | .for_background = 1, |
820 | .range_cyclic = 1, | 844 | .range_cyclic = 1, |
845 | .reason = WB_REASON_BACKGROUND, | ||
821 | }; | 846 | }; |
822 | 847 | ||
823 | return wb_writeback(wb, &work); | 848 | return wb_writeback(wb, &work); |
@@ -851,6 +876,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) | |||
851 | .sync_mode = WB_SYNC_NONE, | 876 | .sync_mode = WB_SYNC_NONE, |
852 | .for_kupdate = 1, | 877 | .for_kupdate = 1, |
853 | .range_cyclic = 1, | 878 | .range_cyclic = 1, |
879 | .reason = WB_REASON_PERIODIC, | ||
854 | }; | 880 | }; |
855 | 881 | ||
856 | return wb_writeback(wb, &work); | 882 | return wb_writeback(wb, &work); |
@@ -969,7 +995,7 @@ int bdi_writeback_thread(void *data) | |||
969 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | 995 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back |
970 | * the whole world. | 996 | * the whole world. |
971 | */ | 997 | */ |
972 | void wakeup_flusher_threads(long nr_pages) | 998 | void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) |
973 | { | 999 | { |
974 | struct backing_dev_info *bdi; | 1000 | struct backing_dev_info *bdi; |
975 | 1001 | ||
@@ -982,7 +1008,7 @@ void wakeup_flusher_threads(long nr_pages) | |||
982 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { | 1008 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { |
983 | if (!bdi_has_dirty_io(bdi)) | 1009 | if (!bdi_has_dirty_io(bdi)) |
984 | continue; | 1010 | continue; |
985 | __bdi_start_writeback(bdi, nr_pages, false); | 1011 | __bdi_start_writeback(bdi, nr_pages, false, reason); |
986 | } | 1012 | } |
987 | rcu_read_unlock(); | 1013 | rcu_read_unlock(); |
988 | } | 1014 | } |
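Every waker must now say why it is kicking the flusher threads. A hypothetical call site, using the zero-means-everything convention documented above:

    /* sync(2)-style: flush everything on every bdi that has dirty IO */
    wakeup_flusher_threads(0, WB_REASON_SYNC);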
@@ -1198,12 +1224,15 @@ static void wait_sb_inodes(struct super_block *sb) | |||
1198 | * writeback_inodes_sb_nr - writeback dirty inodes from given super_block | 1224 | * writeback_inodes_sb_nr - writeback dirty inodes from given super_block |
1199 | * @sb: the superblock | 1225 | * @sb: the superblock |
1200 | * @nr: the number of pages to write | 1226 | * @nr: the number of pages to write |
1227 | * @reason: reason why some writeback work was initiated | ||
1201 | * | 1228 | * |
1202 | * Start writeback on some inodes on this super_block. No guarantees are made | 1229 | * Start writeback on some inodes on this super_block. No guarantees are made |
1203 | * on how many (if any) will be written, and this function does not wait | 1230 | * on how many (if any) will be written, and this function does not wait |
1204 | * for IO completion of submitted IO. | 1231 | * for IO completion of submitted IO. |
1205 | */ | 1232 | */ |
1206 | void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) | 1233 | void writeback_inodes_sb_nr(struct super_block *sb, |
1234 | unsigned long nr, | ||
1235 | enum wb_reason reason) | ||
1207 | { | 1236 | { |
1208 | DECLARE_COMPLETION_ONSTACK(done); | 1237 | DECLARE_COMPLETION_ONSTACK(done); |
1209 | struct wb_writeback_work work = { | 1238 | struct wb_writeback_work work = { |
@@ -1212,6 +1241,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) | |||
1212 | .tagged_writepages = 1, | 1241 | .tagged_writepages = 1, |
1213 | .done = &done, | 1242 | .done = &done, |
1214 | .nr_pages = nr, | 1243 | .nr_pages = nr, |
1244 | .reason = reason, | ||
1215 | }; | 1245 | }; |
1216 | 1246 | ||
1217 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 1247 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
@@ -1223,29 +1253,31 @@ EXPORT_SYMBOL(writeback_inodes_sb_nr); | |||
1223 | /** | 1253 | /** |
1224 | * writeback_inodes_sb - writeback dirty inodes from given super_block | 1254 | * writeback_inodes_sb - writeback dirty inodes from given super_block |
1225 | * @sb: the superblock | 1255 | * @sb: the superblock |
1256 | * @reason: reason why some writeback work was initiated | ||
1226 | * | 1257 | * |
1227 | * Start writeback on some inodes on this super_block. No guarantees are made | 1258 | * Start writeback on some inodes on this super_block. No guarantees are made |
1228 | * on how many (if any) will be written, and this function does not wait | 1259 | * on how many (if any) will be written, and this function does not wait |
1229 | * for IO completion of submitted IO. | 1260 | * for IO completion of submitted IO. |
1230 | */ | 1261 | */ |
1231 | void writeback_inodes_sb(struct super_block *sb) | 1262 | void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) |
1232 | { | 1263 | { |
1233 | return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); | 1264 | return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); |
1234 | } | 1265 | } |
1235 | EXPORT_SYMBOL(writeback_inodes_sb); | 1266 | EXPORT_SYMBOL(writeback_inodes_sb); |
1236 | 1267 | ||
1237 | /** | 1268 | /** |
1238 | * writeback_inodes_sb_if_idle - start writeback if none underway | 1269 | * writeback_inodes_sb_if_idle - start writeback if none underway |
1239 | * @sb: the superblock | 1270 | * @sb: the superblock |
1271 | * @reason: reason why some writeback work was initiated | ||
1240 | * | 1272 | * |
1241 | * Invoke writeback_inodes_sb if no writeback is currently underway. | 1273 | * Invoke writeback_inodes_sb if no writeback is currently underway. |
1242 | * Returns 1 if writeback was started, 0 if not. | 1274 | * Returns 1 if writeback was started, 0 if not. |
1243 | */ | 1275 | */ |
1244 | int writeback_inodes_sb_if_idle(struct super_block *sb) | 1276 | int writeback_inodes_sb_if_idle(struct super_block *sb, enum wb_reason reason) |
1245 | { | 1277 | { |
1246 | if (!writeback_in_progress(sb->s_bdi)) { | 1278 | if (!writeback_in_progress(sb->s_bdi)) { |
1247 | down_read(&sb->s_umount); | 1279 | down_read(&sb->s_umount); |
1248 | writeback_inodes_sb(sb); | 1280 | writeback_inodes_sb(sb, reason); |
1249 | up_read(&sb->s_umount); | 1281 | up_read(&sb->s_umount); |
1250 | return 1; | 1282 | return 1; |
1251 | } else | 1283 | } else |
@@ -1257,16 +1289,18 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle); | |||
1257 | * writeback_inodes_sb_nr_if_idle - start writeback if none underway | 1289 | * writeback_inodes_sb_nr_if_idle - start writeback if none underway |
1258 | * @sb: the superblock | 1290 | * @sb: the superblock |
1259 | * @nr: the number of pages to write | 1291 | * @nr: the number of pages to write |
1292 | * @reason: reason why some writeback work was initiated | ||
1260 | * | 1293 | * |
1261 | * Invoke writeback_inodes_sb if no writeback is currently underway. | 1294 | * Invoke writeback_inodes_sb if no writeback is currently underway. |
1262 | * Returns 1 if writeback was started, 0 if not. | 1295 | * Returns 1 if writeback was started, 0 if not. |
1263 | */ | 1296 | */ |
1264 | int writeback_inodes_sb_nr_if_idle(struct super_block *sb, | 1297 | int writeback_inodes_sb_nr_if_idle(struct super_block *sb, |
1265 | unsigned long nr) | 1298 | unsigned long nr, |
1299 | enum wb_reason reason) | ||
1266 | { | 1300 | { |
1267 | if (!writeback_in_progress(sb->s_bdi)) { | 1301 | if (!writeback_in_progress(sb->s_bdi)) { |
1268 | down_read(&sb->s_umount); | 1302 | down_read(&sb->s_umount); |
1269 | writeback_inodes_sb_nr(sb, nr); | 1303 | writeback_inodes_sb_nr(sb, nr, reason); |
1270 | up_read(&sb->s_umount); | 1304 | up_read(&sb->s_umount); |
1271 | return 1; | 1305 | return 1; |
1272 | } else | 1306 | } else |
@@ -1290,6 +1324,7 @@ void sync_inodes_sb(struct super_block *sb) | |||
1290 | .nr_pages = LONG_MAX, | 1324 | .nr_pages = LONG_MAX, |
1291 | .range_cyclic = 0, | 1325 | .range_cyclic = 0, |
1292 | .done = &done, | 1326 | .done = &done, |
1327 | .reason = WB_REASON_SYNC, | ||
1293 | }; | 1328 | }; |
1294 | 1329 | ||
1295 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 1330 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
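For filesystems the visible change is one extra argument on each super_block-level entry point. A sketch of a call site; WB_REASON_FS_FREE_SPACE is assumed to exist alongside the three reason values that appear in this diff:

    /* Opportunistically flush this sb before giving up with ENOSPC: */
    if (writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE)) {
            /* writeback started; the caller might retry its allocation */
    }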
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cca47f7b0..3426521f320 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
48 | #include <linux/spinlock.h> | 48 | #include <linux/spinlock.h> |
49 | #include <linux/stat.h> | 49 | #include <linux/stat.h> |
50 | #include <linux/module.h> | ||
50 | 51 | ||
51 | #include "fuse_i.h" | 52 | #include "fuse_i.h" |
52 | 53 | ||
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7e823bbd245..cb23c2be731 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/buffer_head.h> | 14 | #include <linux/buffer_head.h> |
15 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
16 | #include <linux/kthread.h> | 16 | #include <linux/kthread.h> |
17 | #include <linux/export.h> | ||
17 | #include <linux/namei.h> | 18 | #include <linux/namei.h> |
18 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
19 | #include <linux/gfs2_ondisk.h> | 20 | #include <linux/gfs2_ondisk.h> |
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index e673a88b8ae..b1ce4c7ad3f 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c | |||
@@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) | |||
40 | 40 | ||
41 | src = in->name; | 41 | src = in->name; |
42 | srclen = in->len; | 42 | srclen = in->len; |
43 | if (srclen > HFS_NAMELEN) | ||
44 | srclen = HFS_NAMELEN; | ||
43 | dst = out; | 45 | dst = out; |
44 | dstlen = HFS_MAX_NAMELEN; | 46 | dstlen = HFS_MAX_NAMELEN; |
45 | if (nls_io) { | 47 | if (nls_io) { |
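The two added lines clamp the on-disk length before the conversion loop, so a corrupted catalog record with in->len larger than HFS_NAMELEN (31) can no longer read past the fixed-size name field. The same guard could be written with the kernel's min_t() helper, an equivalent sketch rather than what the patch uses:

    srclen = min_t(unsigned, in->len, HFS_NAMELEN);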
diff --git a/fs/ioprio.c b/fs/ioprio.c index 7da2a06508e..f79dab83e17 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c | |||
@@ -21,6 +21,7 @@ | |||
21 | */ | 21 | */ |
22 | #include <linux/gfp.h> | 22 | #include <linux/gfp.h> |
23 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
24 | #include <linux/export.h> | ||
24 | #include <linux/ioprio.h> | 25 | #include <linux/ioprio.h> |
25 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
26 | #include <linux/capability.h> | 27 | #include <linux/capability.h> |
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index de4247021d2..5b6c9d1a2fb 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c | |||
@@ -53,6 +53,78 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this, | |||
53 | return 0; | 53 | return 0; |
54 | } | 54 | } |
55 | 55 | ||
56 | /* | ||
57 | * jffs2_selected_compress: | ||
58 | * @compr: Explicit compression type to use (e.g., JFFS2_COMPR_ZLIB). | ||
59 | * If 0, just take the first available compression mode. | ||
60 | * @data_in: Pointer to uncompressed data | ||
61 | * @cpage_out: Pointer to returned pointer to buffer for compressed data | ||
62 | * @datalen: On entry, holds the amount of data available for compression. | ||
63 | * On exit, expected to hold the amount of data actually compressed. | ||
64 | * @cdatalen: On entry, holds the amount of space available for compressed | ||
65 | * data. On exit, expected to hold the actual size of the compressed | ||
66 | * data. | ||
67 | * | ||
68 | * Returns: the compression type used. Zero is used to show that the data | ||
69 | * could not be compressed; probably because we couldn't find the requested | ||
70 | * compression mode. | ||
71 | */ | ||
72 | static int jffs2_selected_compress(u8 compr, unsigned char *data_in, | ||
73 | unsigned char **cpage_out, u32 *datalen, u32 *cdatalen) | ||
74 | { | ||
75 | struct jffs2_compressor *this; | ||
76 | int err, ret = JFFS2_COMPR_NONE; | ||
77 | uint32_t orig_slen, orig_dlen; | ||
78 | char *output_buf; | ||
79 | |||
80 | output_buf = kmalloc(*cdatalen, GFP_KERNEL); | ||
81 | if (!output_buf) { | ||
82 | printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); | ||
83 | return ret; | ||
84 | } | ||
85 | orig_slen = *datalen; | ||
86 | orig_dlen = *cdatalen; | ||
87 | spin_lock(&jffs2_compressor_list_lock); | ||
88 | list_for_each_entry(this, &jffs2_compressor_list, list) { | ||
89 | /* Skip decompress-only and disabled modules */ | ||
90 | if (!this->compress || this->disabled) | ||
91 | continue; | ||
92 | |||
93 | /* Skip if not the desired compression type */ | ||
94 | if (compr && (compr != this->compr)) | ||
95 | continue; | ||
96 | |||
97 | /* | ||
98 | * Either compression type was unspecified, or we found our | ||
99 | * compressor; either way, we're good to go. | ||
100 | */ | ||
101 | this->usecount++; | ||
102 | spin_unlock(&jffs2_compressor_list_lock); | ||
103 | |||
104 | *datalen = orig_slen; | ||
105 | *cdatalen = orig_dlen; | ||
106 | err = this->compress(data_in, output_buf, datalen, cdatalen); | ||
107 | |||
108 | spin_lock(&jffs2_compressor_list_lock); | ||
109 | this->usecount--; | ||
110 | if (!err) { | ||
111 | /* Success */ | ||
112 | ret = this->compr; | ||
113 | this->stat_compr_blocks++; | ||
114 | this->stat_compr_orig_size += *datalen; | ||
115 | this->stat_compr_new_size += *cdatalen; | ||
116 | break; | ||
117 | } | ||
118 | } | ||
119 | spin_unlock(&jffs2_compressor_list_lock); | ||
120 | if (ret == JFFS2_COMPR_NONE) | ||
121 | kfree(output_buf); | ||
122 | else | ||
123 | *cpage_out = output_buf; | ||
124 | |||
125 | return ret; | ||
126 | } | ||
127 | |||
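A hypothetical caller of the new helper, mirroring the FORCEZLIB case added further down; the buffer names and length are illustrative. On any return other than JFFS2_COMPR_NONE, *cpage_out points at a kmalloc'd buffer the caller now owns; on JFFS2_COMPR_NONE nothing was allocated:

    unsigned char *cdata = NULL;
    uint32_t slen = len, clen = len;     /* len: bytes available, assumed */
    int type = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in,
                                       &cdata, &slen, &clen);
    if (type == JFFS2_COMPR_NONE) {
            /* zlib absent or it couldn't shrink the data: store raw */
    }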
56 | /* jffs2_compress: | 128 | /* jffs2_compress: |
57 | * @data_in: Pointer to uncompressed data | 129 | * @data_in: Pointer to uncompressed data |
58 | * @cpage_out: Pointer to returned pointer to buffer for compressed data | 130 | * @cpage_out: Pointer to returned pointer to buffer for compressed data |
@@ -76,47 +148,23 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, | |||
76 | uint32_t *datalen, uint32_t *cdatalen) | 148 | uint32_t *datalen, uint32_t *cdatalen) |
77 | { | 149 | { |
78 | int ret = JFFS2_COMPR_NONE; | 150 | int ret = JFFS2_COMPR_NONE; |
79 | int compr_ret; | 151 | int mode, compr_ret; |
80 | struct jffs2_compressor *this, *best=NULL; | 152 | struct jffs2_compressor *this, *best=NULL; |
81 | unsigned char *output_buf = NULL, *tmp_buf; | 153 | unsigned char *output_buf = NULL, *tmp_buf; |
82 | uint32_t orig_slen, orig_dlen; | 154 | uint32_t orig_slen, orig_dlen; |
83 | uint32_t best_slen=0, best_dlen=0; | 155 | uint32_t best_slen=0, best_dlen=0; |
84 | 156 | ||
85 | switch (jffs2_compression_mode) { | 157 | if (c->mount_opts.override_compr) |
158 | mode = c->mount_opts.compr; | ||
159 | else | ||
160 | mode = jffs2_compression_mode; | ||
161 | |||
162 | switch (mode) { | ||
86 | case JFFS2_COMPR_MODE_NONE: | 163 | case JFFS2_COMPR_MODE_NONE: |
87 | break; | 164 | break; |
88 | case JFFS2_COMPR_MODE_PRIORITY: | 165 | case JFFS2_COMPR_MODE_PRIORITY: |
89 | output_buf = kmalloc(*cdatalen,GFP_KERNEL); | 166 | ret = jffs2_selected_compress(0, data_in, cpage_out, datalen, |
90 | if (!output_buf) { | 167 | cdatalen); |
91 | printk(KERN_WARNING "JFFS2: No memory for compressor allocation. Compression failed.\n"); | ||
92 | goto out; | ||
93 | } | ||
94 | orig_slen = *datalen; | ||
95 | orig_dlen = *cdatalen; | ||
96 | spin_lock(&jffs2_compressor_list_lock); | ||
97 | list_for_each_entry(this, &jffs2_compressor_list, list) { | ||
98 | /* Skip decompress-only backwards-compatibility and disabled modules */ | ||
99 | if ((!this->compress)||(this->disabled)) | ||
100 | continue; | ||
101 | |||
102 | this->usecount++; | ||
103 | spin_unlock(&jffs2_compressor_list_lock); | ||
104 | *datalen = orig_slen; | ||
105 | *cdatalen = orig_dlen; | ||
106 | compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); | ||
107 | spin_lock(&jffs2_compressor_list_lock); | ||
108 | this->usecount--; | ||
109 | if (!compr_ret) { | ||
110 | ret = this->compr; | ||
111 | this->stat_compr_blocks++; | ||
112 | this->stat_compr_orig_size += *datalen; | ||
113 | this->stat_compr_new_size += *cdatalen; | ||
114 | break; | ||
115 | } | ||
116 | } | ||
117 | spin_unlock(&jffs2_compressor_list_lock); | ||
118 | if (ret == JFFS2_COMPR_NONE) | ||
119 | kfree(output_buf); | ||
120 | break; | 168 | break; |
121 | case JFFS2_COMPR_MODE_SIZE: | 169 | case JFFS2_COMPR_MODE_SIZE: |
122 | case JFFS2_COMPR_MODE_FAVOURLZO: | 170 | case JFFS2_COMPR_MODE_FAVOURLZO: |
@@ -174,22 +222,28 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, | |||
174 | best->stat_compr_orig_size += best_slen; | 222 | best->stat_compr_orig_size += best_slen; |
175 | best->stat_compr_new_size += best_dlen; | 223 | best->stat_compr_new_size += best_dlen; |
176 | ret = best->compr; | 224 | ret = best->compr; |
225 | *cpage_out = output_buf; | ||
177 | } | 226 | } |
178 | spin_unlock(&jffs2_compressor_list_lock); | 227 | spin_unlock(&jffs2_compressor_list_lock); |
179 | break; | 228 | break; |
229 | case JFFS2_COMPR_MODE_FORCELZO: | ||
230 | ret = jffs2_selected_compress(JFFS2_COMPR_LZO, data_in, | ||
231 | cpage_out, datalen, cdatalen); | ||
232 | break; | ||
233 | case JFFS2_COMPR_MODE_FORCEZLIB: | ||
234 | ret = jffs2_selected_compress(JFFS2_COMPR_ZLIB, data_in, | ||
235 | cpage_out, datalen, cdatalen); | ||
236 | break; | ||
180 | default: | 237 | default: |
181 | printk(KERN_ERR "JFFS2: unknown compression mode.\n"); | 238 | printk(KERN_ERR "JFFS2: unknown compression mode.\n"); |
182 | } | 239 | } |
183 | out: | 240 | |
184 | if (ret == JFFS2_COMPR_NONE) { | 241 | if (ret == JFFS2_COMPR_NONE) { |
185 | *cpage_out = data_in; | 242 | *cpage_out = data_in; |
186 | *datalen = *cdatalen; | 243 | *datalen = *cdatalen; |
187 | none_stat_compr_blocks++; | 244 | none_stat_compr_blocks++; |
188 | none_stat_compr_size += *datalen; | 245 | none_stat_compr_size += *datalen; |
189 | } | 246 | } |
190 | else { | ||
191 | *cpage_out = output_buf; | ||
192 | } | ||
193 | return ret; | 247 | return ret; |
194 | } | 248 | } |
195 | 249 | ||
diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index 13bb7597ab3..5e91d578f4e 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h | |||
@@ -40,6 +40,8 @@ | |||
40 | #define JFFS2_COMPR_MODE_PRIORITY 1 | 40 | #define JFFS2_COMPR_MODE_PRIORITY 1 |
41 | #define JFFS2_COMPR_MODE_SIZE 2 | 41 | #define JFFS2_COMPR_MODE_SIZE 2 |
42 | #define JFFS2_COMPR_MODE_FAVOURLZO 3 | 42 | #define JFFS2_COMPR_MODE_FAVOURLZO 3 |
43 | #define JFFS2_COMPR_MODE_FORCELZO 4 | ||
44 | #define JFFS2_COMPR_MODE_FORCEZLIB 5 | ||
43 | 45 | ||
44 | #define FAVOUR_LZO_PERCENT 80 | 46 | #define FAVOUR_LZO_PERCENT 80 |
45 | 47 | ||
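The parser added in fs/jffs2/super.c below wires the new modes to a mount option as follows:

    /*  -o compr=none  ->  JFFS2_COMPR_MODE_NONE
     *  -o compr=lzo   ->  JFFS2_COMPR_MODE_FORCELZO   (CONFIG_JFFS2_LZO)
     *  -o compr=zlib  ->  JFFS2_COMPR_MODE_FORCEZLIB  (CONFIG_JFFS2_ZLIB)
     */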
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 7286e44ac66..4b8afe39a87 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c | |||
@@ -379,7 +379,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) | |||
379 | jffs2_do_setattr(inode, &iattr); | 379 | jffs2_do_setattr(inode, &iattr); |
380 | } | 380 | } |
381 | 381 | ||
382 | int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) | 382 | int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) |
383 | { | 383 | { |
384 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); | 384 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); |
385 | 385 | ||
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 0bc6a6c80a5..55a0c1dcead 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h | |||
@@ -29,6 +29,11 @@ | |||
29 | 29 | ||
30 | struct jffs2_inodirty; | 30 | struct jffs2_inodirty; |
31 | 31 | ||
32 | struct jffs2_mount_opts { | ||
33 | bool override_compr; | ||
34 | unsigned int compr; | ||
35 | }; | ||
36 | |||
32 | /* A struct for the overall file system control. Pointers to | 37 | /* A struct for the overall file system control. Pointers to |
33 | jffs2_sb_info structs are named `c' in the source code. | 38 | jffs2_sb_info structs are named `c' in the source code. |
34 | Nee jffs_control | 39 | Nee jffs_control |
@@ -126,6 +131,7 @@ struct jffs2_sb_info { | |||
126 | #endif | 131 | #endif |
127 | 132 | ||
128 | struct jffs2_summary *summary; /* Summary information */ | 133 | struct jffs2_summary *summary; /* Summary information */ |
134 | struct jffs2_mount_opts mount_opts; | ||
129 | 135 | ||
130 | #ifdef CONFIG_JFFS2_FS_XATTR | 136 | #ifdef CONFIG_JFFS2_FS_XATTR |
131 | #define XATTRINDEX_HASHSIZE (57) | 137 | #define XATTRINDEX_HASHSIZE (57) |
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 6c1755c59c0..ab65ee3ec85 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h | |||
@@ -176,7 +176,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags); | |||
176 | struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, | 176 | struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, |
177 | struct jffs2_raw_inode *ri); | 177 | struct jffs2_raw_inode *ri); |
178 | int jffs2_statfs (struct dentry *, struct kstatfs *); | 178 | int jffs2_statfs (struct dentry *, struct kstatfs *); |
179 | int jffs2_remount_fs (struct super_block *, int *, char *); | 179 | int jffs2_do_remount_fs(struct super_block *, int *, char *); |
180 | int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); | 180 | int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); |
181 | void jffs2_gc_release_inode(struct jffs2_sb_info *c, | 181 | void jffs2_gc_release_inode(struct jffs2_sb_info *c, |
182 | struct jffs2_inode_info *f); | 182 | struct jffs2_inode_info *f); |
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 8d8cd3419d0..28107ca136e 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c | |||
@@ -275,9 +275,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) | |||
275 | else | 275 | else |
276 | c->mtd->unpoint(c->mtd, 0, c->mtd->size); | 276 | c->mtd->unpoint(c->mtd, 0, c->mtd->size); |
277 | #endif | 277 | #endif |
278 | if (s) | 278 | kfree(s); |
279 | kfree(s); | ||
280 | |||
281 | return ret; | 279 | return ret; |
282 | } | 280 | } |
283 | 281 | ||
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 853b8e30008..e7e97445411 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c | |||
@@ -17,11 +17,13 @@ | |||
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/err.h> | 18 | #include <linux/err.h> |
19 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
20 | #include <linux/parser.h> | ||
20 | #include <linux/jffs2.h> | 21 | #include <linux/jffs2.h> |
21 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
22 | #include <linux/mtd/super.h> | 23 | #include <linux/mtd/super.h> |
23 | #include <linux/ctype.h> | 24 | #include <linux/ctype.h> |
24 | #include <linux/namei.h> | 25 | #include <linux/namei.h> |
26 | #include <linux/seq_file.h> | ||
25 | #include <linux/exportfs.h> | 27 | #include <linux/exportfs.h> |
26 | #include "compr.h" | 28 | #include "compr.h" |
27 | #include "nodelist.h" | 29 | #include "nodelist.h" |
@@ -75,6 +77,37 @@ static void jffs2_write_super(struct super_block *sb) | |||
75 | unlock_super(sb); | 77 | unlock_super(sb); |
76 | } | 78 | } |
77 | 79 | ||
80 | static const char *jffs2_compr_name(unsigned int compr) | ||
81 | { | ||
82 | switch (compr) { | ||
83 | case JFFS2_COMPR_MODE_NONE: | ||
84 | return "none"; | ||
85 | #ifdef CONFIG_JFFS2_LZO | ||
86 | case JFFS2_COMPR_MODE_FORCELZO: | ||
87 | return "lzo"; | ||
88 | #endif | ||
89 | #ifdef CONFIG_JFFS2_ZLIB | ||
90 | case JFFS2_COMPR_MODE_FORCEZLIB: | ||
91 | return "zlib"; | ||
92 | #endif | ||
93 | default: | ||
94 | /* should never happen; programmer error */ | ||
95 | WARN_ON(1); | ||
96 | return ""; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | static int jffs2_show_options(struct seq_file *s, struct vfsmount *mnt) | ||
101 | { | ||
102 | struct jffs2_sb_info *c = JFFS2_SB_INFO(mnt->mnt_sb); | ||
103 | struct jffs2_mount_opts *opts = &c->mount_opts; | ||
104 | |||
105 | if (opts->override_compr) | ||
106 | seq_printf(s, ",compr=%s", jffs2_compr_name(opts->compr)); | ||
107 | |||
108 | return 0; | ||
109 | } | ||
110 | |||
78 | static int jffs2_sync_fs(struct super_block *sb, int wait) | 111 | static int jffs2_sync_fs(struct super_block *sb, int wait) |
79 | { | 112 | { |
80 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); | 113 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); |
@@ -133,6 +166,85 @@ static const struct export_operations jffs2_export_ops = { | |||
133 | .fh_to_parent = jffs2_fh_to_parent, | 166 | .fh_to_parent = jffs2_fh_to_parent, |
134 | }; | 167 | }; |
135 | 168 | ||
169 | /* | ||
170 | * JFFS2 mount options. | ||
171 | * | ||
172 | * Opt_override_compr: override default compressor | ||
173 | * Opt_err: end-of-array marker | ||
174 | */ | ||
175 | enum { | ||
176 | Opt_override_compr, | ||
177 | Opt_err, | ||
178 | }; | ||
179 | |||
180 | static const match_table_t tokens = { | ||
181 | {Opt_override_compr, "compr=%s"}, | ||
182 | {Opt_err, NULL}, | ||
183 | }; | ||
184 | |||
185 | static int jffs2_parse_options(struct jffs2_sb_info *c, char *data) | ||
186 | { | ||
187 | substring_t args[MAX_OPT_ARGS]; | ||
188 | char *p, *name; | ||
189 | |||
190 | if (!data) | ||
191 | return 0; | ||
192 | |||
193 | while ((p = strsep(&data, ","))) { | ||
194 | int token; | ||
195 | |||
196 | if (!*p) | ||
197 | continue; | ||
198 | |||
199 | token = match_token(p, tokens, args); | ||
200 | switch (token) { | ||
201 | case Opt_override_compr: | ||
202 | name = match_strdup(&args[0]); | ||
203 | |||
204 | if (!name) | ||
205 | return -ENOMEM; | ||
206 | if (!strcmp(name, "none")) | ||
207 | c->mount_opts.compr = JFFS2_COMPR_MODE_NONE; | ||
208 | #ifdef CONFIG_JFFS2_LZO | ||
209 | else if (!strcmp(name, "lzo")) | ||
210 | c->mount_opts.compr = JFFS2_COMPR_MODE_FORCELZO; | ||
211 | #endif | ||
212 | #ifdef CONFIG_JFFS2_ZLIB | ||
213 | else if (!strcmp(name, "zlib")) | ||
214 | c->mount_opts.compr = | ||
215 | JFFS2_COMPR_MODE_FORCEZLIB; | ||
216 | #endif | ||
217 | else { | ||
218 | printk(KERN_ERR "JFFS2 Error: unknown compressor \"%s\"\n", | ||
219 | name); | ||
220 | kfree(name); | ||
221 | return -EINVAL; | ||
222 | } | ||
223 | kfree(name); | ||
224 | c->mount_opts.override_compr = true; | ||
225 | break; | ||
226 | default: | ||
227 | printk(KERN_ERR "JFFS2 Error: unrecognized mount option '%s' or missing value\n", | ||
228 | p); | ||
229 | return -EINVAL; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data) | ||
237 | { | ||
238 | struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); | ||
239 | int err; | ||
240 | |||
241 | err = jffs2_parse_options(c, data); | ||
242 | if (err) | ||
243 | return -EINVAL; | ||
244 | |||
245 | return jffs2_do_remount_fs(sb, flags, data); | ||
246 | } | ||
247 | |||
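What the parser leaves behind on success, as a minimal hypothetical check (strsep() modifies its argument, so the option string must be writable):

    char opts[] = "compr=none";
    struct jffs2_sb_info c = { };

    if (jffs2_parse_options(&c, opts) == 0) {
            /* c.mount_opts.override_compr == true
             * c.mount_opts.compr          == JFFS2_COMPR_MODE_NONE */
    }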
136 | static const struct super_operations jffs2_super_operations = | 248 | static const struct super_operations jffs2_super_operations = |
137 | { | 249 | { |
138 | .alloc_inode = jffs2_alloc_inode, | 250 | .alloc_inode = jffs2_alloc_inode, |
@@ -143,6 +255,7 @@ static const struct super_operations jffs2_super_operations = | |||
143 | .remount_fs = jffs2_remount_fs, | 255 | .remount_fs = jffs2_remount_fs, |
144 | .evict_inode = jffs2_evict_inode, | 256 | .evict_inode = jffs2_evict_inode, |
145 | .dirty_inode = jffs2_dirty_inode, | 257 | .dirty_inode = jffs2_dirty_inode, |
258 | .show_options = jffs2_show_options, | ||
146 | .sync_fs = jffs2_sync_fs, | 259 | .sync_fs = jffs2_sync_fs, |
147 | }; | 260 | }; |
148 | 261 | ||
@@ -166,6 +279,12 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) | |||
166 | c->os_priv = sb; | 279 | c->os_priv = sb; |
167 | sb->s_fs_info = c; | 280 | sb->s_fs_info = c; |
168 | 281 | ||
282 | ret = jffs2_parse_options(c, data); | ||
283 | if (ret) { | ||
284 | kfree(c); | ||
285 | return -EINVAL; | ||
286 | } | ||
287 | |||
169 | /* Initialize JFFS2 superblock locks, the further initialization will | 288 | /* Initialize JFFS2 superblock locks, the further initialization will |
170 | * be done later */ | 289 | * be done later */ |
171 | mutex_init(&c->alloc_sem); | 290 | mutex_init(&c->alloc_sem); |
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4515bea0268..b09e51d2f81 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c | |||
@@ -578,8 +578,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) | |||
578 | if (!jffs2_is_writebuffered(c)) | 578 | if (!jffs2_is_writebuffered(c)) |
579 | return 0; | 579 | return 0; |
580 | 580 | ||
581 | if (mutex_trylock(&c->alloc_sem)) { | 581 | if (!mutex_is_locked(&c->alloc_sem)) { |
582 | mutex_unlock(&c->alloc_sem); | ||
583 | printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); | 582 | printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); |
584 | BUG(); | 583 | BUG(); |
585 | } | 584 | } |
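The old trylock/unlock pair briefly acquired alloc_sem whenever the caller had in fact forgotten to hold it; mutex_is_locked() makes the sanity check side-effect free. Under lockdep the same intent could be expressed as below, an alternative that additionally verifies the current task is the holder, not what the patch uses:

    lockdep_assert_held(&c->alloc_sem);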
@@ -1026,7 +1025,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c, | |||
1026 | int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); | 1025 | int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); |
1027 | struct mtd_oob_ops ops; | 1026 | struct mtd_oob_ops ops; |
1028 | 1027 | ||
1029 | ops.mode = MTD_OOB_AUTO; | 1028 | ops.mode = MTD_OPS_AUTO_OOB; |
1030 | ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; | 1029 | ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail; |
1031 | ops.oobbuf = c->oobbuf; | 1030 | ops.oobbuf = c->oobbuf; |
1032 | ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; | 1031 | ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; |
@@ -1069,7 +1068,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c, | |||
1069 | struct mtd_oob_ops ops; | 1068 | struct mtd_oob_ops ops; |
1070 | int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); | 1069 | int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); |
1071 | 1070 | ||
1072 | ops.mode = MTD_OOB_AUTO; | 1071 | ops.mode = MTD_OPS_AUTO_OOB; |
1073 | ops.ooblen = cmlen; | 1072 | ops.ooblen = cmlen; |
1074 | ops.oobbuf = c->oobbuf; | 1073 | ops.oobbuf = c->oobbuf; |
1075 | ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; | 1074 | ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; |
@@ -1095,7 +1094,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c, | |||
1095 | struct mtd_oob_ops ops; | 1094 | struct mtd_oob_ops ops; |
1096 | int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); | 1095 | int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE); |
1097 | 1096 | ||
1098 | ops.mode = MTD_OOB_AUTO; | 1097 | ops.mode = MTD_OPS_AUTO_OOB; |
1099 | ops.ooblen = cmlen; | 1098 | ops.ooblen = cmlen; |
1100 | ops.oobbuf = (uint8_t *)&oob_cleanmarker; | 1099 | ops.oobbuf = (uint8_t *)&oob_cleanmarker; |
1101 | ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; | 1100 | ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; |
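These three hunks only track an MTD constant rename (MTD_OOB_AUTO became MTD_OPS_AUTO_OOB); automatic OOB placement itself is unchanged. The same setup written with a designated initializer, for comparison:

    struct mtd_oob_ops ops = {
            .mode   = MTD_OPS_AUTO_OOB,   /* driver chooses OOB placement */
            .ooblen = cmlen,
            .oobbuf = c->oobbuf,
            /* .len, .ooboffs, .retlen, .oobretlen default to zero */
    };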
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 583636f745e..cc5f811ed38 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c | |||
@@ -67,6 +67,7 @@ | |||
67 | #include <linux/buffer_head.h> /* for sync_blockdev() */ | 67 | #include <linux/buffer_head.h> /* for sync_blockdev() */ |
68 | #include <linux/bio.h> | 68 | #include <linux/bio.h> |
69 | #include <linux/freezer.h> | 69 | #include <linux/freezer.h> |
70 | #include <linux/export.h> | ||
70 | #include <linux/delay.h> | 71 | #include <linux/delay.h> |
71 | #include <linux/mutex.h> | 72 | #include <linux/mutex.h> |
72 | #include <linux/seq_file.h> | 73 | #include <linux/seq_file.h> |
diff --git a/fs/logfs/super.c b/fs/logfs/super.c index f2697e4df10..e795c234ea3 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/bio.h> | 13 | #include <linux/bio.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
16 | #include <linux/module.h> | ||
16 | #include <linux/mtd/mtd.h> | 17 | #include <linux/mtd/mtd.h> |
17 | #include <linux/statfs.h> | 18 | #include <linux/statfs.h> |
18 | #include <linux/buffer_head.h> | 19 | #include <linux/buffer_head.h> |
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 3f32bcb0d9b..ef175cb8cfd 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c | |||
@@ -16,38 +16,26 @@ | |||
16 | #include <linux/bitops.h> | 16 | #include <linux/bitops.h> |
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | 18 | ||
19 | static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; | ||
20 | |||
21 | static DEFINE_SPINLOCK(bitmap_lock); | 19 | static DEFINE_SPINLOCK(bitmap_lock); |
22 | 20 | ||
23 | static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) | 21 | /* |
22 | * bitmap consists of blocks filled with 16bit words | ||
23 | * bit set == busy, bit clear == free | ||
24 | * endianness is a mess, but for counting zero bits it really doesn't matter... | ||
25 | */ | ||
26 | static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) | ||
24 | { | 27 | { |
25 | unsigned i, j, sum = 0; | 28 | __u32 sum = 0; |
26 | struct buffer_head *bh; | 29 | unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8); |
27 | |||
28 | for (i=0; i<numblocks-1; i++) { | ||
29 | if (!(bh=map[i])) | ||
30 | return(0); | ||
31 | for (j=0; j<bh->b_size; j++) | ||
32 | sum += nibblemap[bh->b_data[j] & 0xf] | ||
33 | + nibblemap[(bh->b_data[j]>>4) & 0xf]; | ||
34 | } | ||
35 | 30 | ||
36 | if (numblocks==0 || !(bh=map[numblocks-1])) | 31 | while (blocks--) { |
37 | return(0); | 32 | unsigned words = blocksize / 2; |
38 | i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; | 33 | __u16 *p = (__u16 *)(*map++)->b_data; |
39 | for (j=0; j<i; j++) { | 34 | while (words--) |
40 | sum += nibblemap[bh->b_data[j] & 0xf] | 35 | sum += 16 - hweight16(*p++); |
41 | + nibblemap[(bh->b_data[j]>>4) & 0xf]; | ||
42 | } | 36 | } |
43 | 37 | ||
44 | i = numbits%16; | 38 | return sum; |
45 | if (i!=0) { | ||
46 | i = *(__u16 *)(&bh->b_data[j]) | ~((1<<i) - 1); | ||
47 | sum += nibblemap[i & 0xf] + nibblemap[(i>>4) & 0xf]; | ||
48 | sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf]; | ||
49 | } | ||
50 | return(sum); | ||
51 | } | 39 | } |
52 | 40 | ||
53 | void minix_free_block(struct inode *inode, unsigned long block) | 41 | void minix_free_block(struct inode *inode, unsigned long block) |
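The rewritten count_free() swaps the nibble lookup table for popcounts over 16-bit bitmap words. The per-word step, with hweight16() being the kernel's 16-bit population count from <linux/bitops.h>:

    /* 0xFFFE has 15 bits set, so it contributes 16 - 15 == 1 free slot;
     * an all-busy 0xFFFF word contributes 0. */
    __u16 w = 0xFFFE;
    __u32 freebits = 16 - hweight16(w);   /* == 1 */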
@@ -105,10 +93,12 @@ int minix_new_block(struct inode * inode) | |||
105 | return 0; | 93 | return 0; |
106 | } | 94 | } |
107 | 95 | ||
108 | unsigned long minix_count_free_blocks(struct minix_sb_info *sbi) | 96 | unsigned long minix_count_free_blocks(struct super_block *sb) |
109 | { | 97 | { |
110 | return (count_free(sbi->s_zmap, sbi->s_zmap_blocks, | 98 | struct minix_sb_info *sbi = minix_sb(sb); |
111 | sbi->s_nzones - sbi->s_firstdatazone + 1) | 99 | u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); |
100 | |||
101 | return (count_free(sbi->s_zmap, sb->s_blocksize, bits) | ||
112 | << sbi->s_log_zone_size); | 102 | << sbi->s_log_zone_size); |
113 | } | 103 | } |
114 | 104 | ||
@@ -273,7 +263,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) | |||
273 | return inode; | 263 | return inode; |
274 | } | 264 | } |
275 | 265 | ||
276 | unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) | 266 | unsigned long minix_count_free_inodes(struct super_block *sb) |
277 | { | 267 | { |
278 | return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); | 268 | struct minix_sb_info *sbi = minix_sb(sb); |
269 | u32 bits = sbi->s_ninodes + 1; | ||
270 | |||
271 | return count_free(sbi->s_imap, sb->s_blocksize, bits); | ||
279 | } | 272 | } |
diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 64cdcd662ff..1d9e33966db 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c | |||
@@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) | |||
279 | else if (sbi->s_mount_state & MINIX_ERROR_FS) | 279 | else if (sbi->s_mount_state & MINIX_ERROR_FS) |
280 | printk("MINIX-fs: mounting file system with errors, " | 280 | printk("MINIX-fs: mounting file system with errors, " |
281 | "running fsck is recommended\n"); | 281 | "running fsck is recommended\n"); |
282 | |||
283 | /* Apparently minix can create filesystems that allocate more blocks for | ||
284 | * the bitmaps than needed. We simply ignore that, but verify that it | ||
285 | * didn't create one with too few blocks, and bail out if so. | ||
286 | */ | ||
287 | block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); | ||
288 | if (sbi->s_imap_blocks < block) { | ||
289 | printk("MINIX-fs: file system does not have enough " | ||
290 | "imap blocks allocated. Refusing to mount\n"); | ||
291 | goto out_iput; | ||
292 | } | ||
293 | |||
294 | block = minix_blocks_needed( | ||
295 | (sbi->s_nzones - (sbi->s_firstdatazone + 1)), | ||
296 | s->s_blocksize); | ||
297 | if (sbi->s_zmap_blocks < block) { | ||
298 | printk("MINIX-fs: file system does not have enough " | ||
299 | "zmap blocks allocated. Refusing to mount.\n"); | ||
300 | goto out_iput; | ||
301 | } | ||
302 | |||
282 | return 0; | 303 | return 0; |
283 | 304 | ||
284 | out_iput: | 305 | out_iput: |
@@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
339 | buf->f_type = sb->s_magic; | 360 | buf->f_type = sb->s_magic; |
340 | buf->f_bsize = sb->s_blocksize; | 361 | buf->f_bsize = sb->s_blocksize; |
341 | buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; | 362 | buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; |
342 | buf->f_bfree = minix_count_free_blocks(sbi); | 363 | buf->f_bfree = minix_count_free_blocks(sb); |
343 | buf->f_bavail = buf->f_bfree; | 364 | buf->f_bavail = buf->f_bfree; |
344 | buf->f_files = sbi->s_ninodes; | 365 | buf->f_files = sbi->s_ninodes; |
345 | buf->f_ffree = minix_count_free_inodes(sbi); | 366 | buf->f_ffree = minix_count_free_inodes(sb); |
346 | buf->f_namelen = sbi->s_namelen; | 367 | buf->f_namelen = sbi->s_namelen; |
347 | buf->f_fsid.val[0] = (u32)id; | 368 | buf->f_fsid.val[0] = (u32)id; |
348 | buf->f_fsid.val[1] = (u32)(id >> 32); | 369 | buf->f_fsid.val[1] = (u32)(id >> 32); |
diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 341e2122879..26bbd55e82e 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h | |||
@@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru | |||
48 | extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); | 48 | extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); |
49 | extern struct inode * minix_new_inode(const struct inode *, int, int *); | 49 | extern struct inode * minix_new_inode(const struct inode *, int, int *); |
50 | extern void minix_free_inode(struct inode * inode); | 50 | extern void minix_free_inode(struct inode * inode); |
51 | extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); | 51 | extern unsigned long minix_count_free_inodes(struct super_block *sb); |
52 | extern int minix_new_block(struct inode * inode); | 52 | extern int minix_new_block(struct inode * inode); |
53 | extern void minix_free_block(struct inode *inode, unsigned long block); | 53 | extern void minix_free_block(struct inode *inode, unsigned long block); |
54 | extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); | 54 | extern unsigned long minix_count_free_blocks(struct super_block *sb); |
55 | extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); | 55 | extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); |
56 | extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); | 56 | extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); |
57 | 57 | ||
@@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode) | |||
88 | return list_entry(inode, struct minix_inode_info, vfs_inode); | 88 | return list_entry(inode, struct minix_inode_info, vfs_inode); |
89 | } | 89 | } |
90 | 90 | ||
91 | static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) | ||
92 | { | ||
93 | return DIV_ROUND_UP(bits, blocksize * 8); | ||
94 | } | ||
95 | |||
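A quick arithmetic check of the helper, matching the mount-time validation added in inode.c above (numbers illustrative):

    /* 10000 inode bits on a 1024-byte-block filesystem:
     * DIV_ROUND_UP(10000, 1024 * 8) == DIV_ROUND_UP(10000, 8192) == 2,
     * so a superblock advertising s_imap_blocks < 2 is now rejected. */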
91 | #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ | 96 | #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ |
92 | defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) | 97 | defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) |
93 | 98 | ||
@@ -125,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) | |||
125 | if (!size) | 130 | if (!size) |
126 | return 0; | 131 | return 0; |
127 | 132 | ||
128 | size = (size >> 4) + ((size & 15) > 0); | 133 | size >>= 4; |
129 | while (*p++ == 0xffff) { | 134 | while (*p++ == 0xffff) { |
130 | if (--size == 0) | 135 | if (--size == 0) |
131 | return (p - addr) << 4; | 136 | return (p - addr) << 4; |
diff --git a/fs/namei.c b/fs/namei.c index ac6d214da82..5008f01787f 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -852,7 +852,7 @@ static int follow_managed(struct path *path, unsigned flags) | |||
852 | mntput(path->mnt); | 852 | mntput(path->mnt); |
853 | if (ret == -EISDIR) | 853 | if (ret == -EISDIR) |
854 | ret = 0; | 854 | ret = 0; |
855 | return ret; | 855 | return ret < 0 ? ret : need_mntput; |
856 | } | 856 | } |
857 | 857 | ||
858 | int follow_down_one(struct path *path) | 858 | int follow_down_one(struct path *path) |
@@ -900,6 +900,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, | |||
900 | break; | 900 | break; |
901 | path->mnt = mounted; | 901 | path->mnt = mounted; |
902 | path->dentry = mounted->mnt_root; | 902 | path->dentry = mounted->mnt_root; |
903 | nd->flags |= LOOKUP_JUMPED; | ||
903 | nd->seq = read_seqcount_begin(&path->dentry->d_seq); | 904 | nd->seq = read_seqcount_begin(&path->dentry->d_seq); |
904 | /* | 905 | /* |
905 | * Update the inode too. We don't need to re-check the | 906 | * Update the inode too. We don't need to re-check the |
@@ -1213,6 +1214,8 @@ retry: | |||
1213 | path_put_conditional(path, nd); | 1214 | path_put_conditional(path, nd); |
1214 | return err; | 1215 | return err; |
1215 | } | 1216 | } |
1217 | if (err) | ||
1218 | nd->flags |= LOOKUP_JUMPED; | ||
1216 | *inode = path->dentry->d_inode; | 1219 | *inode = path->dentry->d_inode; |
1217 | return 0; | 1220 | return 0; |
1218 | } | 1221 | } |
@@ -2146,6 +2149,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, | |||
2146 | } | 2149 | } |
2147 | 2150 | ||
2148 | /* create side of things */ | 2151 | /* create side of things */ |
2152 | /* | ||
2153 | * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been | ||
2154 | * cleared when we got to the last component we are about to look up | ||
2155 | */ | ||
2149 | error = complete_walk(nd); | 2156 | error = complete_walk(nd); |
2150 | if (error) | 2157 | if (error) |
2151 | return ERR_PTR(error); | 2158 | return ERR_PTR(error); |
@@ -2214,6 +2221,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path, | |||
2214 | if (error < 0) | 2221 | if (error < 0) |
2215 | goto exit_dput; | 2222 | goto exit_dput; |
2216 | 2223 | ||
2224 | if (error) | ||
2225 | nd->flags |= LOOKUP_JUMPED; | ||
2226 | |||
2217 | error = -ENOENT; | 2227 | error = -ENOENT; |
2218 | if (!path->dentry->d_inode) | 2228 | if (!path->dentry->d_inode) |
2219 | goto exit_dput; | 2229 | goto exit_dput; |
@@ -2223,6 +2233,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path, | |||
2223 | 2233 | ||
2224 | path_to_nameidata(path, nd); | 2234 | path_to_nameidata(path, nd); |
2225 | nd->inode = path->dentry->d_inode; | 2235 | nd->inode = path->dentry->d_inode; |
2236 | /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ | ||
2237 | error = complete_walk(nd); | ||
2238 | if (error) | ||
2239 | goto exit; | ||
2226 | error = -EISDIR; | 2240 | error = -EISDIR; |
2227 | if (S_ISDIR(nd->inode->i_mode)) | 2241 | if (S_ISDIR(nd->inode->i_mode)) |
2228 | goto exit; | 2242 | goto exit; |
diff --git a/fs/namespace.c b/fs/namespace.c index e5e1c7d1839..cfc6d4448aa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
@@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v) | |||
1048 | if (err) | 1048 | if (err) |
1049 | goto out; | 1049 | goto out; |
1050 | seq_putc(m, ' '); | 1050 | seq_putc(m, ' '); |
1051 | seq_path_root(m, &mnt_path, &root, " \t\n\\"); | 1051 | |
1052 | if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { | 1052 | /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ |
1053 | /* | 1053 | err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); |
1054 | * Mountpoint is outside root, discard that one. Ugly, | 1054 | if (err) |
1055 | * but less so than trying to do that in iterator in a | 1055 | goto out; |
1056 | * race-free way (due to renames). | 1056 | |
1057 | */ | ||
1058 | return SEQ_SKIP; | ||
1059 | } | ||
1060 | seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); | 1057 | seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); |
1061 | show_mnt_opts(m, mnt); | 1058 | show_mnt_opts(m, mnt); |
1062 | 1059 | ||
@@ -2483,11 +2480,43 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) | |||
2483 | __mnt_make_longterm(mnt); | 2480 | __mnt_make_longterm(mnt); |
2484 | new_ns->root = mnt; | 2481 | new_ns->root = mnt; |
2485 | list_add(&new_ns->list, &new_ns->root->mnt_list); | 2482 | list_add(&new_ns->list, &new_ns->root->mnt_list); |
2483 | } else { | ||
2484 | mntput(mnt); | ||
2486 | } | 2485 | } |
2487 | return new_ns; | 2486 | return new_ns; |
2488 | } | 2487 | } |
2489 | EXPORT_SYMBOL(create_mnt_ns); | 2488 | EXPORT_SYMBOL(create_mnt_ns); |
2490 | 2489 | ||
2490 | struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) | ||
2491 | { | ||
2492 | struct mnt_namespace *ns; | ||
2493 | struct super_block *s; | ||
2494 | struct path path; | ||
2495 | int err; | ||
2496 | |||
2497 | ns = create_mnt_ns(mnt); | ||
2498 | if (IS_ERR(ns)) | ||
2499 | return ERR_CAST(ns); | ||
2500 | |||
2501 | err = vfs_path_lookup(mnt->mnt_root, mnt, | ||
2502 | name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); | ||
2503 | |||
2504 | put_mnt_ns(ns); | ||
2505 | |||
2506 | if (err) | ||
2507 | return ERR_PTR(err); | ||
2508 | |||
2509 | /* trade a vfsmount reference for active sb one */ | ||
2510 | s = path.mnt->mnt_sb; | ||
2511 | atomic_inc(&s->s_active); | ||
2512 | mntput(path.mnt); | ||
2513 | /* lock the sucker */ | ||
2514 | down_write(&s->s_umount); | ||
2515 | /* ... and return the root of (sub)tree on it */ | ||
2516 | return path.dentry; | ||
2517 | } | ||
2518 | EXPORT_SYMBOL(mount_subtree); | ||
2519 | |||
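A hypothetical caller of the new helper; the path literal is illustrative and the NFS-style use case is assumed:

    struct dentry *root = mount_subtree(mnt, "/export/home");
    if (IS_ERR(root))
            return ERR_CAST(root);
    /* The vfsmount reference has been traded away: root is pinned by an
     * s_active reference and its sb's s_umount is held for write. */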
2491 | SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, | 2520 | SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, |
2492 | char __user *, type, unsigned long, flags, void __user *, data) | 2521 | char __user *, type, unsigned long, flags, void __user *, data) |
2493 | { | 2522 | { |
@@ -2744,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt) | |||
2744 | } | 2773 | } |
2745 | } | 2774 | } |
2746 | EXPORT_SYMBOL(kern_unmount); | 2775 | EXPORT_SYMBOL(kern_unmount); |
2776 | |||
2777 | bool our_mnt(struct vfsmount *mnt) | ||
2778 | { | ||
2779 | return check_mnt(mnt); | ||
2780 | } | ||
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 918ad647afe..726e59a9e50 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c | |||
@@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, | |||
488 | struct xdr_stream *xdr, | 488 | struct xdr_stream *xdr, |
489 | struct cb_recallanyargs *args) | 489 | struct cb_recallanyargs *args) |
490 | { | 490 | { |
491 | __be32 *p; | 491 | uint32_t bitmap[2]; |
492 | __be32 *p, status; | ||
492 | 493 | ||
493 | args->craa_addr = svc_addr(rqstp); | 494 | args->craa_addr = svc_addr(rqstp); |
494 | p = read_buf(xdr, 4); | 495 | p = read_buf(xdr, 4); |
495 | if (unlikely(p == NULL)) | 496 | if (unlikely(p == NULL)) |
496 | return htonl(NFS4ERR_BADXDR); | 497 | return htonl(NFS4ERR_BADXDR); |
497 | args->craa_objs_to_keep = ntohl(*p++); | 498 | args->craa_objs_to_keep = ntohl(*p++); |
498 | p = read_buf(xdr, 4); | 499 | status = decode_bitmap(xdr, bitmap); |
499 | if (unlikely(p == NULL)) | 500 | if (unlikely(status)) |
500 | return htonl(NFS4ERR_BADXDR); | 501 | return status; |
501 | args->craa_type_mask = ntohl(*p); | 502 | args->craa_type_mask = bitmap[0]; |
502 | 503 | ||
503 | return 0; | 504 | return 0; |
504 | } | 505 | } |
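The type mask in CB_RECALL_ANY is a standard XDR bitmap4, not a single word, which is why the open-coded 4-byte read broke for any array length other than one:

    /*  +----------+----------+----------+--
     *  | length=n |  word 0  |  word 1  | ...   n 32-bit mask words
     *  +----------+----------+----------+--
     * decode_bitmap() (helper assumed elsewhere in this file) consumes
     * the whole array and fills at most two words, so craa_type_mask is
     * read correctly whatever length the client sends. */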
@@ -986,4 +987,5 @@ struct svc_version nfs4_callback_version4 = { | |||
986 | .vs_proc = nfs4_callback_procedures1, | 987 | .vs_proc = nfs4_callback_procedures1, |
987 | .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, | 988 | .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, |
988 | .vs_dispatch = NULL, | 989 | .vs_dispatch = NULL, |
990 | .vs_hidden = 1, | ||
989 | }; | 991 | }; |
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b238d95ac48..ac289909814 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
@@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry | |||
1468 | res = NULL; | 1468 | res = NULL; |
1469 | goto out; | 1469 | goto out; |
1470 | /* This turned out not to be a regular file */ | 1470 | /* This turned out not to be a regular file */ |
1471 | case -EISDIR: | ||
1471 | case -ENOTDIR: | 1472 | case -ENOTDIR: |
1472 | goto no_open; | 1473 | goto no_open; |
1473 | case -ELOOP: | 1474 | case -ELOOP: |
1474 | if (!(nd->intent.open.flags & O_NOFOLLOW)) | 1475 | if (!(nd->intent.open.flags & O_NOFOLLOW)) |
1475 | goto no_open; | 1476 | goto no_open; |
1476 | /* case -EISDIR: */ | ||
1477 | /* case -EINVAL: */ | 1477 | /* case -EINVAL: */ |
1478 | default: | 1478 | default: |
1479 | res = ERR_CAST(inode); | 1479 | res = ERR_CAST(inode); |
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 91c01f0a4c3..eca56d4b39c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
@@ -40,48 +40,8 @@ | |||
40 | 40 | ||
41 | #define NFSDBG_FACILITY NFSDBG_FILE | 41 | #define NFSDBG_FACILITY NFSDBG_FILE |
42 | 42 | ||
43 | static int nfs_file_open(struct inode *, struct file *); | ||
44 | static int nfs_file_release(struct inode *, struct file *); | ||
45 | static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); | ||
46 | static int nfs_file_mmap(struct file *, struct vm_area_struct *); | ||
47 | static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, | ||
48 | struct pipe_inode_info *pipe, | ||
49 | size_t count, unsigned int flags); | ||
50 | static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, | ||
51 | unsigned long nr_segs, loff_t pos); | ||
52 | static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, | ||
53 | struct file *filp, loff_t *ppos, | ||
54 | size_t count, unsigned int flags); | ||
55 | static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, | ||
56 | unsigned long nr_segs, loff_t pos); | ||
57 | static int nfs_file_flush(struct file *, fl_owner_t id); | ||
58 | static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync); | ||
59 | static int nfs_check_flags(int flags); | ||
60 | static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); | ||
61 | static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); | ||
62 | static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); | ||
63 | |||
64 | static const struct vm_operations_struct nfs_file_vm_ops; | 43 | static const struct vm_operations_struct nfs_file_vm_ops; |
65 | 44 | ||
66 | const struct file_operations nfs_file_operations = { | ||
67 | .llseek = nfs_file_llseek, | ||
68 | .read = do_sync_read, | ||
69 | .write = do_sync_write, | ||
70 | .aio_read = nfs_file_read, | ||
71 | .aio_write = nfs_file_write, | ||
72 | .mmap = nfs_file_mmap, | ||
73 | .open = nfs_file_open, | ||
74 | .flush = nfs_file_flush, | ||
75 | .release = nfs_file_release, | ||
76 | .fsync = nfs_file_fsync, | ||
77 | .lock = nfs_lock, | ||
78 | .flock = nfs_flock, | ||
79 | .splice_read = nfs_file_splice_read, | ||
80 | .splice_write = nfs_file_splice_write, | ||
81 | .check_flags = nfs_check_flags, | ||
82 | .setlease = nfs_setlease, | ||
83 | }; | ||
84 | |||
85 | const struct inode_operations nfs_file_inode_operations = { | 45 | const struct inode_operations nfs_file_inode_operations = { |
86 | .permission = nfs_permission, | 46 | .permission = nfs_permission, |
87 | .getattr = nfs_getattr, | 47 | .getattr = nfs_getattr, |
@@ -137,11 +97,9 @@ nfs_file_open(struct inode *inode, struct file *filp) | |||
137 | static int | 97 | static int |
138 | nfs_file_release(struct inode *inode, struct file *filp) | 98 | nfs_file_release(struct inode *inode, struct file *filp) |
139 | { | 99 | { |
140 | struct dentry *dentry = filp->f_path.dentry; | ||
141 | |||
142 | dprintk("NFS: release(%s/%s)\n", | 100 | dprintk("NFS: release(%s/%s)\n", |
143 | dentry->d_parent->d_name.name, | 101 | filp->f_path.dentry->d_parent->d_name.name, |
144 | dentry->d_name.name); | 102 | filp->f_path.dentry->d_name.name); |
145 | 103 | ||
146 | nfs_inc_stats(inode, NFSIOS_VFSRELEASE); | 104 | nfs_inc_stats(inode, NFSIOS_VFSRELEASE); |
147 | return nfs_release(inode, filp); | 105 | return nfs_release(inode, filp); |
@@ -228,14 +186,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, | |||
228 | struct dentry * dentry = iocb->ki_filp->f_path.dentry; | 186 | struct dentry * dentry = iocb->ki_filp->f_path.dentry; |
229 | struct inode * inode = dentry->d_inode; | 187 | struct inode * inode = dentry->d_inode; |
230 | ssize_t result; | 188 | ssize_t result; |
231 | size_t count = iov_length(iov, nr_segs); | ||
232 | 189 | ||
233 | if (iocb->ki_filp->f_flags & O_DIRECT) | 190 | if (iocb->ki_filp->f_flags & O_DIRECT) |
234 | return nfs_file_direct_read(iocb, iov, nr_segs, pos); | 191 | return nfs_file_direct_read(iocb, iov, nr_segs, pos); |
235 | 192 | ||
236 | dprintk("NFS: read(%s/%s, %lu@%lu)\n", | 193 | dprintk("NFS: read(%s/%s, %lu@%lu)\n", |
237 | dentry->d_parent->d_name.name, dentry->d_name.name, | 194 | dentry->d_parent->d_name.name, dentry->d_name.name, |
238 | (unsigned long) count, (unsigned long) pos); | 195 | (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); |
239 | 196 | ||
240 | result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); | 197 | result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); |
241 | if (!result) { | 198 | if (!result) { |
@@ -889,3 +846,54 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) | |||
889 | file->f_path.dentry->d_name.name, arg); | 846 | file->f_path.dentry->d_name.name, arg); |
890 | return -EINVAL; | 847 | return -EINVAL; |
891 | } | 848 | } |
849 | |||
850 | const struct file_operations nfs_file_operations = { | ||
851 | .llseek = nfs_file_llseek, | ||
852 | .read = do_sync_read, | ||
853 | .write = do_sync_write, | ||
854 | .aio_read = nfs_file_read, | ||
855 | .aio_write = nfs_file_write, | ||
856 | .mmap = nfs_file_mmap, | ||
857 | .open = nfs_file_open, | ||
858 | .flush = nfs_file_flush, | ||
859 | .release = nfs_file_release, | ||
860 | .fsync = nfs_file_fsync, | ||
861 | .lock = nfs_lock, | ||
862 | .flock = nfs_flock, | ||
863 | .splice_read = nfs_file_splice_read, | ||
864 | .splice_write = nfs_file_splice_write, | ||
865 | .check_flags = nfs_check_flags, | ||
866 | .setlease = nfs_setlease, | ||
867 | }; | ||
868 | |||
869 | #ifdef CONFIG_NFS_V4 | ||
870 | static int | ||
871 | nfs4_file_open(struct inode *inode, struct file *filp) | ||
872 | { | ||
873 | /* | ||
874 | * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to | ||
875 | * this point, then something is very wrong | ||
876 | */ | ||
877 | dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); | ||
878 | return -ENOTDIR; | ||
879 | } | ||
880 | |||
881 | const struct file_operations nfs4_file_operations = { | ||
882 | .llseek = nfs_file_llseek, | ||
883 | .read = do_sync_read, | ||
884 | .write = do_sync_write, | ||
885 | .aio_read = nfs_file_read, | ||
886 | .aio_write = nfs_file_write, | ||
887 | .mmap = nfs_file_mmap, | ||
888 | .open = nfs4_file_open, | ||
889 | .flush = nfs_file_flush, | ||
890 | .release = nfs_file_release, | ||
891 | .fsync = nfs_file_fsync, | ||
892 | .lock = nfs_lock, | ||
893 | .flock = nfs_flock, | ||
894 | .splice_read = nfs_file_splice_read, | ||
895 | .splice_write = nfs_file_splice_write, | ||
896 | .check_flags = nfs_check_flags, | ||
897 | .setlease = nfs_setlease, | ||
898 | }; | ||
899 | #endif /* CONFIG_NFS_V4 */ | ||
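
Taken together with the hunks that follow, the point of moving nfs_file_operations and adding nfs4_file_operations is that the vtable choice becomes data-driven: each protocol version's nfs_rpc_ops names its own file_operations, and the v4 table installs an open stub because an NFSv4 open happens during lookup/d_revalidate, never through ->open. A minimal sketch of that dispatch shape (cut-down, hypothetical types, kernel context assumed; not the real declarations):

    /* sketch only: why the nfs_fhget() hunk below can stay version-agnostic */
    struct rpc_ops_mini {
            const struct file_operations *file_ops;  /* per-version vtable */
    };

    static const struct rpc_ops_mini v3_mini = { .file_ops = &nfs_file_operations };
    static const struct rpc_ops_mini v4_mini = { .file_ops = &nfs4_file_operations };

    static void init_regular_inode(struct inode *inode,
                                   const struct rpc_ops_mini *ops)
    {
            inode->i_fop = ops->file_ops;  /* one assignment, no #ifdef */
    }
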
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c07a55aec83..50a15fa8cf9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
@@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | |||
291 | */ | 291 | */ |
292 | inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; | 292 | inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; |
293 | if (S_ISREG(inode->i_mode)) { | 293 | if (S_ISREG(inode->i_mode)) { |
294 | inode->i_fop = &nfs_file_operations; | 294 | inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; |
295 | inode->i_data.a_ops = &nfs_file_aops; | 295 | inode->i_data.a_ops = &nfs_file_aops; |
296 | inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; | 296 | inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; |
297 | } else if (S_ISDIR(inode->i_mode)) { | 297 | } else if (S_ISDIR(inode->i_mode)) { |
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c1a1bd8ddf1..3f4d95751d5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
@@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); | |||
299 | extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, | 299 | extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, |
300 | struct list_head *head); | 300 | struct list_head *head); |
301 | 301 | ||
302 | extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, | ||
303 | struct inode *inode); | ||
302 | extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); | 304 | extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); |
303 | extern void nfs_readdata_release(struct nfs_read_data *rdata); | 305 | extern void nfs_readdata_release(struct nfs_read_data *rdata); |
304 | 306 | ||
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 85f1690ca08..d4bc9ed9174 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c | |||
@@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { | |||
853 | .dentry_ops = &nfs_dentry_operations, | 853 | .dentry_ops = &nfs_dentry_operations, |
854 | .dir_inode_ops = &nfs3_dir_inode_operations, | 854 | .dir_inode_ops = &nfs3_dir_inode_operations, |
855 | .file_inode_ops = &nfs3_file_inode_operations, | 855 | .file_inode_ops = &nfs3_file_inode_operations, |
856 | .file_ops = &nfs_file_operations, | ||
856 | .getroot = nfs3_proc_get_root, | 857 | .getroot = nfs3_proc_get_root, |
857 | .getattr = nfs3_proc_getattr, | 858 | .getattr = nfs3_proc_getattr, |
858 | .setattr = nfs3_proc_setattr, | 859 | .setattr = nfs3_proc_setattr, |
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 09119418402..a62d36b9a99 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c | |||
@@ -31,6 +31,7 @@ | |||
31 | 31 | ||
32 | #include <linux/nfs_fs.h> | 32 | #include <linux/nfs_fs.h> |
33 | #include <linux/nfs_page.h> | 33 | #include <linux/nfs_page.h> |
34 | #include <linux/module.h> | ||
34 | 35 | ||
35 | #include "internal.h" | 36 | #include "internal.h" |
36 | #include "nfs4filelayout.h" | 37 | #include "nfs4filelayout.h" |
@@ -449,9 +450,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, | |||
449 | 450 | ||
450 | fl->dsaddr = dsaddr; | 451 | fl->dsaddr = dsaddr; |
451 | 452 | ||
452 | if (fl->first_stripe_index < 0 || | 453 | if (fl->first_stripe_index >= dsaddr->stripe_count) { |
453 | fl->first_stripe_index >= dsaddr->stripe_count) { | 454 | dprintk("%s Bad first_stripe_index %u\n", |
454 | dprintk("%s Bad first_stripe_index %d\n", | ||
455 | __func__, fl->first_stripe_index); | 455 | __func__, fl->first_stripe_index); |
456 | goto out_put; | 456 | goto out_put; |
457 | } | 457 | } |
@@ -552,7 +552,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, | |||
552 | 552 | ||
553 | /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. | 553 | /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. |
554 | * Further checking is done in filelayout_check_layout */ | 554 | * Further checking is done in filelayout_check_layout */ |
555 | if (fl->num_fh < 0 || fl->num_fh > | 555 | if (fl->num_fh > |
556 | max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) | 556 | max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) |
557 | goto out_err; | 557 | goto out_err; |
558 | 558 | ||
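
Both deletions in this file are the same cleanup: first_stripe_index and num_fh are unsigned, so the dropped `< 0` comparisons could never be true (gcc's -Wtype-limits flags them as always-false), and the matching format specifier becomes %u. A standalone illustration of the pitfall:

    #include <stdio.h>
    #define MAX_CNT 4096U

    static int check(unsigned int n)
    {
            if (n < 0)              /* always false for an unsigned type */
                    return -1;      /* dead code, like the lines removed above */
            if (n >= MAX_CNT)       /* the upper bound is the only real check */
                    return -1;
            return 0;
    }

    int main(void)
    {
            printf("%d\n", check(-1));  /* -1 wraps to UINT_MAX: caught by >= */
            return 0;
    }
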
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2ae413c986..be2bbac1381 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
@@ -2464,8 +2464,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst | |||
2464 | case -NFS4ERR_BADNAME: | 2464 | case -NFS4ERR_BADNAME: |
2465 | return -ENOENT; | 2465 | return -ENOENT; |
2466 | case -NFS4ERR_MOVED: | 2466 | case -NFS4ERR_MOVED: |
2467 | err = nfs4_get_referral(dir, name, fattr, fhandle); | 2467 | return nfs4_get_referral(dir, name, fattr, fhandle); |
2468 | break; | ||
2469 | case -NFS4ERR_WRONGSEC: | 2468 | case -NFS4ERR_WRONGSEC: |
2470 | nfs_fixup_secinfo_attributes(fattr, fhandle); | 2469 | nfs_fixup_secinfo_attributes(fattr, fhandle); |
2471 | } | 2470 | } |
@@ -5950,6 +5949,7 @@ static void nfs4_layoutcommit_release(void *calldata) | |||
5950 | { | 5949 | { |
5951 | struct nfs4_layoutcommit_data *data = calldata; | 5950 | struct nfs4_layoutcommit_data *data = calldata; |
5952 | struct pnfs_layout_segment *lseg, *tmp; | 5951 | struct pnfs_layout_segment *lseg, *tmp; |
5952 | unsigned long *bitlock = &NFS_I(data->args.inode)->flags; | ||
5953 | 5953 | ||
5954 | pnfs_cleanup_layoutcommit(data); | 5954 | pnfs_cleanup_layoutcommit(data); |
5955 | /* Matched by references in pnfs_set_layoutcommit */ | 5955 | /* Matched by references in pnfs_set_layoutcommit */ |
@@ -5959,6 +5959,11 @@ static void nfs4_layoutcommit_release(void *calldata) | |||
5959 | &lseg->pls_flags)) | 5959 | &lseg->pls_flags)) |
5960 | put_lseg(lseg); | 5960 | put_lseg(lseg); |
5961 | } | 5961 | } |
5962 | |||
5963 | clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); | ||
5964 | smp_mb__after_clear_bit(); | ||
5965 | wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); | ||
5966 | |||
5962 | put_rpccred(data->cred); | 5967 | put_rpccred(data->cred); |
5963 | kfree(data); | 5968 | kfree(data); |
5964 | } | 5969 | } |
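
The three added lines are the standard pairing for a flag bit used as a lock: clear_bit_unlock() provides release semantics, smp_mb__after_clear_bit() orders the clear against the waitqueue scan, and wake_up_bit() wakes sleepers. A hedged sketch of the waiter this is aimed at (the actual caller lives elsewhere in the pNFS code; nfs_wait_bit_killable is the usual NFS action callback):

    /* sketch, kernel context assumed: block until LAYOUTCOMMIT finishes */
    static int wait_for_layoutcommit(struct inode *inode)
    {
            struct nfs_inode *nfsi = NFS_I(inode);

            return wait_on_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
                               nfs_wait_bit_killable, TASK_KILLABLE);
    }
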
@@ -6247,6 +6252,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { | |||
6247 | .dentry_ops = &nfs4_dentry_operations, | 6252 | .dentry_ops = &nfs4_dentry_operations, |
6248 | .dir_inode_ops = &nfs4_dir_inode_operations, | 6253 | .dir_inode_ops = &nfs4_dir_inode_operations, |
6249 | .file_inode_ops = &nfs4_file_inode_operations, | 6254 | .file_inode_ops = &nfs4_file_inode_operations, |
6255 | .file_ops = &nfs4_file_operations, | ||
6250 | .getroot = nfs4_proc_get_root, | 6256 | .getroot = nfs4_proc_get_root, |
6251 | .getattr = nfs4_proc_getattr, | 6257 | .getattr = nfs4_proc_getattr, |
6252 | .setattr = nfs4_proc_setattr, | 6258 | .setattr = nfs4_proc_setattr, |
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1dce12f41a4..e6161b213ed 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c | |||
@@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, | |||
6602 | if (status) | 6602 | if (status) |
6603 | goto out; | 6603 | goto out; |
6604 | status = decode_secinfo(xdr, res); | 6604 | status = decode_secinfo(xdr, res); |
6605 | if (status) | ||
6606 | goto out; | ||
6607 | out: | 6605 | out: |
6608 | return status; | 6606 | return status; |
6609 | } | 6607 | } |
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index d0cda12fddc..c807ab93140 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c | |||
@@ -38,21 +38,15 @@ | |||
38 | */ | 38 | */ |
39 | 39 | ||
40 | #include <linux/module.h> | 40 | #include <linux/module.h> |
41 | #include <scsi/osd_initiator.h> | 41 | #include <scsi/osd_ore.h> |
42 | 42 | ||
43 | #include "objlayout.h" | 43 | #include "objlayout.h" |
44 | 44 | ||
45 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD | 45 | #define NFSDBG_FACILITY NFSDBG_PNFS_LD |
46 | 46 | ||
47 | #define _LLU(x) ((unsigned long long)x) | ||
48 | |||
49 | enum { BIO_MAX_PAGES_KMALLOC = | ||
50 | (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), | ||
51 | }; | ||
52 | |||
53 | struct objio_dev_ent { | 47 | struct objio_dev_ent { |
54 | struct nfs4_deviceid_node id_node; | 48 | struct nfs4_deviceid_node id_node; |
55 | struct osd_dev *od; | 49 | struct ore_dev od; |
56 | }; | 50 | }; |
57 | 51 | ||
58 | static void | 52 | static void |
@@ -60,8 +54,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) | |||
60 | { | 54 | { |
61 | struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); | 55 | struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); |
62 | 56 | ||
63 | dprintk("%s: free od=%p\n", __func__, de->od); | 57 | dprintk("%s: free od=%p\n", __func__, de->od.od); |
64 | osduld_put_device(de->od); | 58 | osduld_put_device(de->od.od); |
65 | kfree(de); | 59 | kfree(de); |
66 | } | 60 | } |
67 | 61 | ||
@@ -98,12 +92,12 @@ _dev_list_add(const struct nfs_server *nfss, | |||
98 | nfss->pnfs_curr_ld, | 92 | nfss->pnfs_curr_ld, |
99 | nfss->nfs_client, | 93 | nfss->nfs_client, |
100 | d_id); | 94 | d_id); |
101 | de->od = od; | 95 | de->od.od = od; |
102 | 96 | ||
103 | d = nfs4_insert_deviceid_node(&de->id_node); | 97 | d = nfs4_insert_deviceid_node(&de->id_node); |
104 | n = container_of(d, struct objio_dev_ent, id_node); | 98 | n = container_of(d, struct objio_dev_ent, id_node); |
105 | if (n != de) { | 99 | if (n != de) { |
106 | dprintk("%s: Race with other n->od=%p\n", __func__, n->od); | 100 | dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); |
107 | objio_free_deviceid_node(&de->id_node); | 101 | objio_free_deviceid_node(&de->id_node); |
108 | de = n; | 102 | de = n; |
109 | } | 103 | } |
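
_dev_list_add() keeps the usual optimistic-insert idiom around nfs4_insert_deviceid_node(): build the node without holding the table lock, try to insert it, and if another CPU won the race, free the local copy and adopt the node already in the table. Reduced to a self-contained skeleton (single list, locking elided for illustration):

    struct node { long key; struct node *next; };

    /* returns the resident node; the caller frees its copy when the return
     * value != new, mirroring the n != de branch above */
    static struct node *insert_once(struct node **head, struct node *new)
    {
            struct node *n;

            for (n = *head; n; n = n->next)
                    if (n->key == new->key)
                            return n;       /* lost the race: adopt winner */
            new->next = *head;
            *head = new;
            return new;
    }
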
@@ -111,28 +105,11 @@ _dev_list_add(const struct nfs_server *nfss, | |||
111 | return de; | 105 | return de; |
112 | } | 106 | } |
113 | 107 | ||
114 | struct caps_buffers { | ||
115 | u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; | ||
116 | u8 creds[OSD_CAP_LEN]; | ||
117 | }; | ||
118 | |||
119 | struct objio_segment { | 108 | struct objio_segment { |
120 | struct pnfs_layout_segment lseg; | 109 | struct pnfs_layout_segment lseg; |
121 | 110 | ||
122 | struct pnfs_osd_object_cred *comps; | 111 | struct ore_layout layout; |
123 | 112 | struct ore_components oc; | |
124 | unsigned mirrors_p1; | ||
125 | unsigned stripe_unit; | ||
126 | unsigned group_width; /* Data stripe_units without integrity comps */ | ||
127 | u64 group_depth; | ||
128 | unsigned group_count; | ||
129 | |||
130 | unsigned max_io_size; | ||
131 | |||
132 | unsigned comps_index; | ||
133 | unsigned num_comps; | ||
134 | /* variable length */ | ||
135 | struct objio_dev_ent *ods[]; | ||
136 | }; | 113 | }; |
137 | 114 | ||
138 | static inline struct objio_segment * | 115 | static inline struct objio_segment * |
@@ -141,59 +118,44 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) | |||
141 | return container_of(lseg, struct objio_segment, lseg); | 118 | return container_of(lseg, struct objio_segment, lseg); |
142 | } | 119 | } |
143 | 120 | ||
144 | struct objio_state; | ||
145 | typedef ssize_t (*objio_done_fn)(struct objio_state *ios); | ||
146 | |||
147 | struct objio_state { | 121 | struct objio_state { |
148 | /* Generic layer */ | 122 | /* Generic layer */ |
149 | struct objlayout_io_state ol_state; | 123 | struct objlayout_io_res oir; |
150 | 124 | ||
151 | struct objio_segment *layout; | 125 | bool sync; |
152 | 126 | /*FIXME: Support for extra_bytes at ore_get_rw_state() */ | |
153 | struct kref kref; | 127 | struct ore_io_state *ios; |
154 | objio_done_fn done; | ||
155 | void *private; | ||
156 | |||
157 | unsigned long length; | ||
158 | unsigned numdevs; /* Actually used devs in this IO */ | ||
159 | /* A per-device variable array of size numdevs */ | ||
160 | struct _objio_per_comp { | ||
161 | struct bio *bio; | ||
162 | struct osd_request *or; | ||
163 | unsigned long length; | ||
164 | u64 offset; | ||
165 | unsigned dev; | ||
166 | } per_dev[]; | ||
167 | }; | 128 | }; |
168 | 129 | ||
169 | /* Send and wait for a get_device_info of devices in the layout, | 130 | /* Send and wait for a get_device_info of devices in the layout, |
170 | then look them up with the osd_initiator library */ | 131 | then look them up with the osd_initiator library */ |
171 | static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, | 132 | static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, |
172 | struct objio_segment *objio_seg, unsigned comp, | 133 | struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, |
173 | gfp_t gfp_flags) | 134 | gfp_t gfp_flags) |
174 | { | 135 | { |
175 | struct pnfs_osd_deviceaddr *deviceaddr; | 136 | struct pnfs_osd_deviceaddr *deviceaddr; |
176 | struct nfs4_deviceid *d_id; | ||
177 | struct objio_dev_ent *ode; | 137 | struct objio_dev_ent *ode; |
178 | struct osd_dev *od; | 138 | struct osd_dev *od; |
179 | struct osd_dev_info odi; | 139 | struct osd_dev_info odi; |
180 | int err; | 140 | int err; |
181 | 141 | ||
182 | d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; | ||
183 | |||
184 | ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); | 142 | ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); |
185 | if (ode) | 143 | if (ode) { |
186 | return ode; | 144 | objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ |
145 | return 0; | ||
146 | } | ||
187 | 147 | ||
188 | err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); | 148 | err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); |
189 | if (unlikely(err)) { | 149 | if (unlikely(err)) { |
190 | dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", | 150 | dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", |
191 | __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); | 151 | __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); |
192 | return ERR_PTR(err); | 152 | return err; |
193 | } | 153 | } |
194 | 154 | ||
195 | odi.systemid_len = deviceaddr->oda_systemid.len; | 155 | odi.systemid_len = deviceaddr->oda_systemid.len; |
196 | if (odi.systemid_len > sizeof(odi.systemid)) { | 156 | if (odi.systemid_len > sizeof(odi.systemid)) { |
157 | dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", | ||
158 | __func__, sizeof(odi.systemid)); | ||
197 | err = -EINVAL; | 159 | err = -EINVAL; |
198 | goto out; | 160 | goto out; |
199 | } else if (odi.systemid_len) | 161 | } else if (odi.systemid_len) |
@@ -218,96 +180,53 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, | |||
218 | 180 | ||
219 | ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, | 181 | ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, |
220 | gfp_flags); | 182 | gfp_flags); |
221 | 183 | objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ | |
184 | dprintk("Adding new dev_id(%llx:%llx)\n", | ||
185 | _DEVID_LO(d_id), _DEVID_HI(d_id)); | ||
222 | out: | 186 | out: |
223 | dprintk("%s: return=%d\n", __func__, err); | ||
224 | objlayout_put_deviceinfo(deviceaddr); | 187 | objlayout_put_deviceinfo(deviceaddr); |
225 | return err ? ERR_PTR(err) : ode; | 188 | return err; |
226 | } | 189 | } |
227 | 190 | ||
228 | static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, | 191 | static void copy_single_comp(struct ore_components *oc, unsigned c, |
229 | struct objio_segment *objio_seg, | 192 | struct pnfs_osd_object_cred *src_comp) |
230 | gfp_t gfp_flags) | ||
231 | { | 193 | { |
232 | unsigned i; | 194 | struct ore_comp *ocomp = &oc->comps[c]; |
233 | int err; | ||
234 | 195 | ||
235 | /* lookup all devices */ | 196 | WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ |
236 | for (i = 0; i < objio_seg->num_comps; i++) { | 197 | WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); |
237 | struct objio_dev_ent *ode; | ||
238 | 198 | ||
239 | ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); | 199 | ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; |
240 | if (unlikely(IS_ERR(ode))) { | 200 | ocomp->obj.id = src_comp->oc_object_id.oid_object_id; |
241 | err = PTR_ERR(ode); | ||
242 | goto out; | ||
243 | } | ||
244 | objio_seg->ods[i] = ode; | ||
245 | } | ||
246 | err = 0; | ||
247 | 201 | ||
248 | out: | 202 | memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); |
249 | dprintk("%s: return=%d\n", __func__, err); | ||
250 | return err; | ||
251 | } | 203 | } |
252 | 204 | ||
253 | static int _verify_data_map(struct pnfs_osd_layout *layout) | 205 | int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, |
206 | struct objio_segment **pseg) | ||
254 | { | 207 | { |
255 | struct pnfs_osd_data_map *data_map = &layout->olo_map; | 208 | struct __alloc_objio_segment { |
256 | u64 stripe_length; | 209 | struct objio_segment olseg; |
257 | u32 group_width; | 210 | struct ore_dev *ods[numdevs]; |
258 | 211 | struct ore_comp comps[numdevs]; | |
259 | /* FIXME: Only raid0 for now. if not go through MDS */ | 212 | } *aolseg; |
260 | if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { | ||
261 | printk(KERN_ERR "Only RAID_0 for now\n"); | ||
262 | return -ENOTSUPP; | ||
263 | } | ||
264 | if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { | ||
265 | printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", | ||
266 | data_map->odm_num_comps, data_map->odm_mirror_cnt); | ||
267 | return -EINVAL; | ||
268 | } | ||
269 | 213 | ||
270 | if (data_map->odm_group_width) | 214 | aolseg = kzalloc(sizeof(*aolseg), gfp_flags); |
271 | group_width = data_map->odm_group_width; | 215 | if (unlikely(!aolseg)) { |
272 | else | 216 | dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, |
273 | group_width = data_map->odm_num_comps / | 217 | numdevs, sizeof(*aolseg)); |
274 | (data_map->odm_mirror_cnt + 1); | 218 | return -ENOMEM; |
275 | |||
276 | stripe_length = (u64)data_map->odm_stripe_unit * group_width; | ||
277 | if (stripe_length >= (1ULL << 32)) { | ||
278 | printk(KERN_ERR "Total Stripe length(0x%llx)" | ||
279 | " >= 32bit is not supported\n", _LLU(stripe_length)); | ||
280 | return -ENOTSUPP; | ||
281 | } | 219 | } |
282 | 220 | ||
283 | if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { | 221 | aolseg->olseg.oc.numdevs = numdevs; |
284 | printk(KERN_ERR "Stripe Unit(0x%llx)" | 222 | aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; |
285 | " must be Multples of PAGE_SIZE(0x%lx)\n", | 223 | aolseg->olseg.oc.comps = aolseg->comps; |
286 | _LLU(data_map->odm_stripe_unit), PAGE_SIZE); | 224 | aolseg->olseg.oc.ods = aolseg->ods; |
287 | return -ENOTSUPP; | ||
288 | } | ||
289 | 225 | ||
226 | *pseg = &aolseg->olseg; | ||
290 | return 0; | 227 | return 0; |
291 | } | 228 | } |
292 | 229 | ||
293 | static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, | ||
294 | struct pnfs_osd_object_cred *src_comp, | ||
295 | struct caps_buffers *caps_p) | ||
296 | { | ||
297 | WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); | ||
298 | WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); | ||
299 | |||
300 | *cur_comp = *src_comp; | ||
301 | |||
302 | memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, | ||
303 | sizeof(caps_p->caps_key)); | ||
304 | cur_comp->oc_cap_key.cred = caps_p->caps_key; | ||
305 | |||
306 | memcpy(caps_p->creds, src_comp->oc_cap.cred, | ||
307 | sizeof(caps_p->creds)); | ||
308 | cur_comp->oc_cap.cred = caps_p->creds; | ||
309 | } | ||
310 | |||
311 | int objio_alloc_lseg(struct pnfs_layout_segment **outp, | 230 | int objio_alloc_lseg(struct pnfs_layout_segment **outp, |
312 | struct pnfs_layout_hdr *pnfslay, | 231 | struct pnfs_layout_hdr *pnfslay, |
313 | struct pnfs_layout_range *range, | 232 | struct pnfs_layout_range *range, |
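
__alloc_objio_seg() above replaces the old hand-computed kzalloc size (struct plus ods[], comps[] and caps laid out by pointer arithmetic) with a wrapper struct whose tail arrays are sized by numdevs, using GCC's variable-length-array-in-struct extension, so the compiler does the layout and one allocation still carries everything. The same trick in a user-space sketch:

    #include <stdlib.h>

    struct seg { unsigned numdevs; void **ods; int *comps; };

    static struct seg *alloc_seg(unsigned numdevs)
    {
            struct wrapper {                /* GCC VLA-in-struct, as above */
                    struct seg s;
                    void *ods[numdevs];
                    int comps[numdevs];
            } *w = calloc(1, sizeof(*w));   /* single allocation */

            if (!w)
                    return NULL;
            w->s.numdevs = numdevs;
            w->s.ods = w->ods;              /* tail arrays, compiler-placed */
            w->s.comps = w->comps;
            return &w->s;
    }
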
@@ -317,59 +236,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, | |||
317 | struct objio_segment *objio_seg; | 236 | struct objio_segment *objio_seg; |
318 | struct pnfs_osd_xdr_decode_layout_iter iter; | 237 | struct pnfs_osd_xdr_decode_layout_iter iter; |
319 | struct pnfs_osd_layout layout; | 238 | struct pnfs_osd_layout layout; |
320 | struct pnfs_osd_object_cred *cur_comp, src_comp; | 239 | struct pnfs_osd_object_cred src_comp; |
321 | struct caps_buffers *caps_p; | 240 | unsigned cur_comp; |
322 | int err; | 241 | int err; |
323 | 242 | ||
324 | err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); | 243 | err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); |
325 | if (unlikely(err)) | 244 | if (unlikely(err)) |
326 | return err; | 245 | return err; |
327 | 246 | ||
328 | err = _verify_data_map(&layout); | 247 | err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); |
329 | if (unlikely(err)) | 248 | if (unlikely(err)) |
330 | return err; | 249 | return err; |
331 | 250 | ||
332 | objio_seg = kzalloc(sizeof(*objio_seg) + | 251 | objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; |
333 | sizeof(objio_seg->ods[0]) * layout.olo_num_comps + | 252 | objio_seg->layout.group_width = layout.olo_map.odm_group_width; |
334 | sizeof(*objio_seg->comps) * layout.olo_num_comps + | 253 | objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; |
335 | sizeof(struct caps_buffers) * layout.olo_num_comps, | 254 | objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; |
336 | gfp_flags); | 255 | objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; |
337 | if (!objio_seg) | ||
338 | return -ENOMEM; | ||
339 | 256 | ||
340 | objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); | 257 | err = ore_verify_layout(layout.olo_map.odm_num_comps, |
341 | cur_comp = objio_seg->comps; | 258 | &objio_seg->layout); |
342 | caps_p = (void *)(cur_comp + layout.olo_num_comps); | ||
343 | while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) | ||
344 | copy_single_comp(cur_comp++, &src_comp, caps_p++); | ||
345 | if (unlikely(err)) | 259 | if (unlikely(err)) |
346 | goto err; | 260 | goto err; |
347 | 261 | ||
348 | objio_seg->num_comps = layout.olo_num_comps; | 262 | objio_seg->oc.first_dev = layout.olo_comps_index; |
349 | objio_seg->comps_index = layout.olo_comps_index; | 263 | cur_comp = 0; |
350 | err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); | 264 | while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { |
351 | if (err) | 265 | copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); |
352 | goto err; | 266 | err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, |
353 | 267 | &src_comp.oc_object_id.oid_device_id, | |
354 | objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; | 268 | gfp_flags); |
355 | objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; | 269 | if (err) |
356 | if (layout.olo_map.odm_group_width) { | 270 | goto err; |
357 | objio_seg->group_width = layout.olo_map.odm_group_width; | 271 | ++cur_comp; |
358 | objio_seg->group_depth = layout.olo_map.odm_group_depth; | ||
359 | objio_seg->group_count = layout.olo_map.odm_num_comps / | ||
360 | objio_seg->mirrors_p1 / | ||
361 | objio_seg->group_width; | ||
362 | } else { | ||
363 | objio_seg->group_width = layout.olo_map.odm_num_comps / | ||
364 | objio_seg->mirrors_p1; | ||
365 | objio_seg->group_depth = -1; | ||
366 | objio_seg->group_count = 1; | ||
367 | } | 272 | } |
368 | 273 | /* pnfs_osd_xdr_decode_layout_comp returns false on error */ | |
369 | /* Cache this calculation it will hit for every page */ | 274 | if (unlikely(err)) |
370 | objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - | 275 | goto err; |
371 | objio_seg->stripe_unit) * | ||
372 | objio_seg->group_width; | ||
373 | 276 | ||
374 | *outp = &objio_seg->lseg; | 277 | *outp = &objio_seg->lseg; |
375 | return 0; | 278 | return 0; |
@@ -386,43 +289,63 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) | |||
386 | int i; | 289 | int i; |
387 | struct objio_segment *objio_seg = OBJIO_LSEG(lseg); | 290 | struct objio_segment *objio_seg = OBJIO_LSEG(lseg); |
388 | 291 | ||
389 | for (i = 0; i < objio_seg->num_comps; i++) { | 292 | for (i = 0; i < objio_seg->oc.numdevs; i++) { |
390 | if (!objio_seg->ods[i]) | 293 | struct ore_dev *od = objio_seg->oc.ods[i]; |
294 | struct objio_dev_ent *ode; | ||
295 | |||
296 | if (!od) | ||
391 | break; | 297 | break; |
392 | nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); | 298 | ode = container_of(od, typeof(*ode), od); |
299 | nfs4_put_deviceid_node(&ode->id_node); | ||
393 | } | 300 | } |
394 | kfree(objio_seg); | 301 | kfree(objio_seg); |
395 | } | 302 | } |
396 | 303 | ||
397 | int objio_alloc_io_state(struct pnfs_layout_segment *lseg, | 304 | static int |
398 | struct objlayout_io_state **outp, | 305 | objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, |
399 | gfp_t gfp_flags) | 306 | struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, |
307 | loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, | ||
308 | struct objio_state **outp) | ||
400 | { | 309 | { |
401 | struct objio_segment *objio_seg = OBJIO_LSEG(lseg); | 310 | struct objio_segment *objio_seg = OBJIO_LSEG(lseg); |
402 | struct objio_state *ios; | 311 | struct ore_io_state *ios; |
403 | const unsigned first_size = sizeof(*ios) + | 312 | int ret; |
404 | objio_seg->num_comps * sizeof(ios->per_dev[0]); | 313 | struct __alloc_objio_state { |
405 | const unsigned sec_size = objio_seg->num_comps * | 314 | struct objio_state objios; |
406 | sizeof(ios->ol_state.ioerrs[0]); | 315 | struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; |
407 | 316 | } *aos; | |
408 | ios = kzalloc(first_size + sec_size, gfp_flags); | 317 | |
409 | if (unlikely(!ios)) | 318 | aos = kzalloc(sizeof(*aos), gfp_flags); |
319 | if (unlikely(!aos)) | ||
410 | return -ENOMEM; | 320 | return -ENOMEM; |
411 | 321 | ||
412 | ios->layout = objio_seg; | 322 | objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, |
413 | ios->ol_state.ioerrs = ((void *)ios) + first_size; | 323 | aos->ioerrs, rpcdata, pnfs_layout_type); |
414 | ios->ol_state.num_comps = objio_seg->num_comps; | ||
415 | 324 | ||
416 | *outp = &ios->ol_state; | 325 | ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, |
326 | offset, count, &ios); | ||
327 | if (unlikely(ret)) { | ||
328 | kfree(aos); | ||
329 | return ret; | ||
330 | } | ||
331 | |||
332 | ios->pages = pages; | ||
333 | ios->pgbase = pgbase; | ||
334 | ios->private = aos; | ||
335 | BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); | ||
336 | |||
337 | aos->objios.sync = 0; | ||
338 | aos->objios.ios = ios; | ||
339 | *outp = &aos->objios; | ||
417 | return 0; | 340 | return 0; |
418 | } | 341 | } |
419 | 342 | ||
420 | void objio_free_io_state(struct objlayout_io_state *ol_state) | 343 | void objio_free_result(struct objlayout_io_res *oir) |
421 | { | 344 | { |
422 | struct objio_state *ios = container_of(ol_state, struct objio_state, | 345 | struct objio_state *objios = container_of(oir, struct objio_state, oir); |
423 | ol_state); | ||
424 | 346 | ||
425 | kfree(ios); | 347 | ore_put_io_state(objios->ios); |
348 | kfree(objios); | ||
426 | } | 349 | } |
427 | 350 | ||
428 | enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) | 351 | enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) |
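
Note the "must use container_of" reminders above: struct objio_dev_ent now embeds struct ore_dev by value instead of holding an osd_dev pointer, so the generic ORE engine can traffic purely in ore_dev pointers while the NFS layer recovers its wrapper arithmetically. The pattern in isolation (stand-in types):

    #include <stddef.h>

    struct ore_dev_mini { int x; };         /* stands in for struct ore_dev */
    struct dev_ent {
            long id;                        /* NFS-private state */
            struct ore_dev_mini od;         /* embedded member, not a pointer */
    };

    /* the recovery objio_free_lseg() does with container_of(od, ..., od) */
    static struct dev_ent *to_dev_ent(struct ore_dev_mini *od)
    {
            return (struct dev_ent *)((char *)od - offsetof(struct dev_ent, od));
    }
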
@@ -455,539 +378,152 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) | |||
455 | } | 378 | } |
456 | } | 379 | } |
457 | 380 | ||
458 | static void _clear_bio(struct bio *bio) | 381 | static void __on_dev_error(struct ore_io_state *ios, |
382 | struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, | ||
383 | u64 dev_offset, u64 dev_len) | ||
459 | { | 384 | { |
460 | struct bio_vec *bv; | 385 | struct objio_state *objios = ios->private; |
461 | unsigned i; | 386 | struct pnfs_osd_objid pooid; |
462 | 387 | struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); | |
463 | __bio_for_each_segment(bv, bio, i, 0) { | 388 | /* FIXME: what to do with more-than-one-group layouts. We need to |
464 | unsigned this_count = bv->bv_len; | 389 | * translate from ore_io_state index to oc->comps index |
465 | 390 | */ | |
466 | if (likely(PAGE_SIZE == this_count)) | 391 | unsigned comp = dev_index; |
467 | clear_highpage(bv->bv_page); | ||
468 | else | ||
469 | zero_user(bv->bv_page, bv->bv_offset, this_count); | ||
470 | } | ||
471 | } | ||
472 | |||
473 | static int _io_check(struct objio_state *ios, bool is_write) | ||
474 | { | ||
475 | enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; | ||
476 | int lin_ret = 0; | ||
477 | int i; | ||
478 | |||
479 | for (i = 0; i < ios->numdevs; i++) { | ||
480 | struct osd_sense_info osi; | ||
481 | struct osd_request *or = ios->per_dev[i].or; | ||
482 | int ret; | ||
483 | |||
484 | if (!or) | ||
485 | continue; | ||
486 | 392 | ||
487 | ret = osd_req_decode_sense(or, &osi); | 393 | pooid.oid_device_id = ode->id_node.deviceid; |
488 | if (likely(!ret)) | 394 | pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; |
489 | continue; | 395 | pooid.oid_object_id = ios->oc->comps[comp].obj.id; |
490 | 396 | ||
491 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { | 397 | objlayout_io_set_result(&objios->oir, comp, |
492 | /* start read offset passed endof file */ | 398 | &pooid, osd_pri_2_pnfs_err(oep), |
493 | BUG_ON(is_write); | 399 | dev_offset, dev_len, !ios->reading); |
494 | _clear_bio(ios->per_dev[i].bio); | ||
495 | dprintk("%s: start read offset passed end of file " | ||
496 | "offset=0x%llx, length=0x%lx\n", __func__, | ||
497 | _LLU(ios->per_dev[i].offset), | ||
498 | ios->per_dev[i].length); | ||
499 | |||
500 | continue; /* we recovered */ | ||
501 | } | ||
502 | objlayout_io_set_result(&ios->ol_state, i, | ||
503 | &ios->layout->comps[i].oc_object_id, | ||
504 | osd_pri_2_pnfs_err(osi.osd_err_pri), | ||
505 | ios->per_dev[i].offset, | ||
506 | ios->per_dev[i].length, | ||
507 | is_write); | ||
508 | |||
509 | if (osi.osd_err_pri >= oep) { | ||
510 | oep = osi.osd_err_pri; | ||
511 | lin_ret = ret; | ||
512 | } | ||
513 | } | ||
514 | |||
515 | return lin_ret; | ||
516 | } | ||
517 | |||
518 | /* | ||
519 | * Common IO state helpers. | ||
520 | */ | ||
521 | static void _io_free(struct objio_state *ios) | ||
522 | { | ||
523 | unsigned i; | ||
524 | |||
525 | for (i = 0; i < ios->numdevs; i++) { | ||
526 | struct _objio_per_comp *per_dev = &ios->per_dev[i]; | ||
527 | |||
528 | if (per_dev->or) { | ||
529 | osd_end_request(per_dev->or); | ||
530 | per_dev->or = NULL; | ||
531 | } | ||
532 | |||
533 | if (per_dev->bio) { | ||
534 | bio_put(per_dev->bio); | ||
535 | per_dev->bio = NULL; | ||
536 | } | ||
537 | } | ||
538 | } | ||
539 | |||
540 | struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) | ||
541 | { | ||
542 | unsigned min_dev = ios->layout->comps_index; | ||
543 | unsigned max_dev = min_dev + ios->layout->num_comps; | ||
544 | |||
545 | BUG_ON(dev < min_dev || max_dev <= dev); | ||
546 | return ios->layout->ods[dev - min_dev]->od; | ||
547 | } | ||
548 | |||
549 | struct _striping_info { | ||
550 | u64 obj_offset; | ||
551 | u64 group_length; | ||
552 | unsigned dev; | ||
553 | unsigned unit_off; | ||
554 | }; | ||
555 | |||
556 | static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, | ||
557 | struct _striping_info *si) | ||
558 | { | ||
559 | u32 stripe_unit = ios->layout->stripe_unit; | ||
560 | u32 group_width = ios->layout->group_width; | ||
561 | u64 group_depth = ios->layout->group_depth; | ||
562 | u32 U = stripe_unit * group_width; | ||
563 | |||
564 | u64 T = U * group_depth; | ||
565 | u64 S = T * ios->layout->group_count; | ||
566 | u64 M = div64_u64(file_offset, S); | ||
567 | |||
568 | /* | ||
569 | G = (L - (M * S)) / T | ||
570 | H = (L - (M * S)) % T | ||
571 | */ | ||
572 | u64 LmodU = file_offset - M * S; | ||
573 | u32 G = div64_u64(LmodU, T); | ||
574 | u64 H = LmodU - G * T; | ||
575 | |||
576 | u32 N = div_u64(H, U); | ||
577 | |||
578 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | ||
579 | si->obj_offset = si->unit_off + (N * stripe_unit) + | ||
580 | (M * group_depth * stripe_unit); | ||
581 | |||
582 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | ||
583 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | ||
584 | si->dev *= ios->layout->mirrors_p1; | ||
585 | |||
586 | si->group_length = T - H; | ||
587 | } | ||
588 | |||
589 | static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, | ||
590 | unsigned pgbase, struct _objio_per_comp *per_dev, int len, | ||
591 | gfp_t gfp_flags) | ||
592 | { | ||
593 | unsigned pg = *cur_pg; | ||
594 | int cur_len = len; | ||
595 | struct request_queue *q = | ||
596 | osd_request_queue(_io_od(ios, per_dev->dev)); | ||
597 | |||
598 | if (per_dev->bio == NULL) { | ||
599 | unsigned pages_in_stripe = ios->layout->group_width * | ||
600 | (ios->layout->stripe_unit / PAGE_SIZE); | ||
601 | unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / | ||
602 | ios->layout->group_width; | ||
603 | |||
604 | if (BIO_MAX_PAGES_KMALLOC < bio_size) | ||
605 | bio_size = BIO_MAX_PAGES_KMALLOC; | ||
606 | |||
607 | per_dev->bio = bio_kmalloc(gfp_flags, bio_size); | ||
608 | if (unlikely(!per_dev->bio)) { | ||
609 | dprintk("Faild to allocate BIO size=%u\n", bio_size); | ||
610 | return -ENOMEM; | ||
611 | } | ||
612 | } | ||
613 | |||
614 | while (cur_len > 0) { | ||
615 | unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); | ||
616 | unsigned added_len; | ||
617 | |||
618 | BUG_ON(ios->ol_state.nr_pages <= pg); | ||
619 | cur_len -= pglen; | ||
620 | |||
621 | added_len = bio_add_pc_page(q, per_dev->bio, | ||
622 | ios->ol_state.pages[pg], pglen, pgbase); | ||
623 | if (unlikely(pglen != added_len)) | ||
624 | return -ENOMEM; | ||
625 | pgbase = 0; | ||
626 | ++pg; | ||
627 | } | ||
628 | BUG_ON(cur_len); | ||
629 | |||
630 | per_dev->length += len; | ||
631 | *cur_pg = pg; | ||
632 | return 0; | ||
633 | } | ||
634 | |||
635 | static int _prepare_one_group(struct objio_state *ios, u64 length, | ||
636 | struct _striping_info *si, unsigned *last_pg, | ||
637 | gfp_t gfp_flags) | ||
638 | { | ||
639 | unsigned stripe_unit = ios->layout->stripe_unit; | ||
640 | unsigned mirrors_p1 = ios->layout->mirrors_p1; | ||
641 | unsigned devs_in_group = ios->layout->group_width * mirrors_p1; | ||
642 | unsigned dev = si->dev; | ||
643 | unsigned first_dev = dev - (dev % devs_in_group); | ||
644 | unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; | ||
645 | unsigned cur_pg = *last_pg; | ||
646 | int ret = 0; | ||
647 | |||
648 | while (length) { | ||
649 | struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; | ||
650 | unsigned cur_len, page_off = 0; | ||
651 | |||
652 | if (!per_dev->length) { | ||
653 | per_dev->dev = dev; | ||
654 | if (dev < si->dev) { | ||
655 | per_dev->offset = si->obj_offset + stripe_unit - | ||
656 | si->unit_off; | ||
657 | cur_len = stripe_unit; | ||
658 | } else if (dev == si->dev) { | ||
659 | per_dev->offset = si->obj_offset; | ||
660 | cur_len = stripe_unit - si->unit_off; | ||
661 | page_off = si->unit_off & ~PAGE_MASK; | ||
662 | BUG_ON(page_off && | ||
663 | (page_off != ios->ol_state.pgbase)); | ||
664 | } else { /* dev > si->dev */ | ||
665 | per_dev->offset = si->obj_offset - si->unit_off; | ||
666 | cur_len = stripe_unit; | ||
667 | } | ||
668 | |||
669 | if (max_comp < dev - first_dev) | ||
670 | max_comp = dev - first_dev; | ||
671 | } else { | ||
672 | cur_len = stripe_unit; | ||
673 | } | ||
674 | if (cur_len >= length) | ||
675 | cur_len = length; | ||
676 | |||
677 | ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, | ||
678 | cur_len, gfp_flags); | ||
679 | if (unlikely(ret)) | ||
680 | goto out; | ||
681 | |||
682 | dev += mirrors_p1; | ||
683 | dev = (dev % devs_in_group) + first_dev; | ||
684 | |||
685 | length -= cur_len; | ||
686 | ios->length += cur_len; | ||
687 | } | ||
688 | out: | ||
689 | ios->numdevs = max_comp + mirrors_p1; | ||
690 | *last_pg = cur_pg; | ||
691 | return ret; | ||
692 | } | ||
693 | |||
694 | static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) | ||
695 | { | ||
696 | u64 length = ios->ol_state.count; | ||
697 | u64 offset = ios->ol_state.offset; | ||
698 | struct _striping_info si; | ||
699 | unsigned last_pg = 0; | ||
700 | int ret = 0; | ||
701 | |||
702 | while (length) { | ||
703 | _calc_stripe_info(ios, offset, &si); | ||
704 | |||
705 | if (length < si.group_length) | ||
706 | si.group_length = length; | ||
707 | |||
708 | ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); | ||
709 | if (unlikely(ret)) | ||
710 | goto out; | ||
711 | |||
712 | offset += si.group_length; | ||
713 | length -= si.group_length; | ||
714 | } | ||
715 | |||
716 | out: | ||
717 | if (!ios->length) | ||
718 | return ret; | ||
719 | |||
720 | return 0; | ||
721 | } | ||
722 | |||
723 | static ssize_t _sync_done(struct objio_state *ios) | ||
724 | { | ||
725 | struct completion *waiting = ios->private; | ||
726 | |||
727 | complete(waiting); | ||
728 | return 0; | ||
729 | } | ||
730 | |||
731 | static void _last_io(struct kref *kref) | ||
732 | { | ||
733 | struct objio_state *ios = container_of(kref, struct objio_state, kref); | ||
734 | |||
735 | ios->done(ios); | ||
736 | } | ||
737 | |||
738 | static void _done_io(struct osd_request *or, void *p) | ||
739 | { | ||
740 | struct objio_state *ios = p; | ||
741 | |||
742 | kref_put(&ios->kref, _last_io); | ||
743 | } | ||
744 | |||
745 | static ssize_t _io_exec(struct objio_state *ios) | ||
746 | { | ||
747 | DECLARE_COMPLETION_ONSTACK(wait); | ||
748 | ssize_t status = 0; /* sync status */ | ||
749 | unsigned i; | ||
750 | objio_done_fn saved_done_fn = ios->done; | ||
751 | bool sync = ios->ol_state.sync; | ||
752 | |||
753 | if (sync) { | ||
754 | ios->done = _sync_done; | ||
755 | ios->private = &wait; | ||
756 | } | ||
757 | |||
758 | kref_init(&ios->kref); | ||
759 | |||
760 | for (i = 0; i < ios->numdevs; i++) { | ||
761 | struct osd_request *or = ios->per_dev[i].or; | ||
762 | |||
763 | if (!or) | ||
764 | continue; | ||
765 | |||
766 | kref_get(&ios->kref); | ||
767 | osd_execute_request_async(or, _done_io, ios); | ||
768 | } | ||
769 | |||
770 | kref_put(&ios->kref, _last_io); | ||
771 | |||
772 | if (sync) { | ||
773 | wait_for_completion(&wait); | ||
774 | status = saved_done_fn(ios); | ||
775 | } | ||
776 | |||
777 | return status; | ||
778 | } | 400 | } |
779 | 401 | ||
780 | /* | 402 | /* |
781 | * read | 403 | * read |
782 | */ | 404 | */ |
783 | static ssize_t _read_done(struct objio_state *ios) | 405 | static void _read_done(struct ore_io_state *ios, void *private) |
784 | { | 406 | { |
407 | struct objio_state *objios = private; | ||
785 | ssize_t status; | 408 | ssize_t status; |
786 | int ret = _io_check(ios, false); | 409 | int ret = ore_check_io(ios, &__on_dev_error); |
787 | 410 | ||
788 | _io_free(ios); | 411 | /* FIXME: _io_free(ios): can we deallocate the libosd resources? */ |
789 | 412 | ||
790 | if (likely(!ret)) | 413 | if (likely(!ret)) |
791 | status = ios->length; | 414 | status = ios->length; |
792 | else | 415 | else |
793 | status = ret; | 416 | status = ret; |
794 | 417 | ||
795 | objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); | 418 | objlayout_read_done(&objios->oir, status, objios->sync); |
796 | return status; | ||
797 | } | 419 | } |
798 | 420 | ||
799 | static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) | 421 | int objio_read_pagelist(struct nfs_read_data *rdata) |
800 | { | 422 | { |
801 | struct osd_request *or = NULL; | 423 | struct objio_state *objios; |
802 | struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; | ||
803 | unsigned dev = per_dev->dev; | ||
804 | struct pnfs_osd_object_cred *cred = | ||
805 | &ios->layout->comps[cur_comp]; | ||
806 | struct osd_obj_id obj = { | ||
807 | .partition = cred->oc_object_id.oid_partition_id, | ||
808 | .id = cred->oc_object_id.oid_object_id, | ||
809 | }; | ||
810 | int ret; | 424 | int ret; |
811 | 425 | ||
812 | or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); | 426 | ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true, |
813 | if (unlikely(!or)) { | 427 | rdata->lseg, rdata->args.pages, rdata->args.pgbase, |
814 | ret = -ENOMEM; | 428 | rdata->args.offset, rdata->args.count, rdata, |
815 | goto err; | 429 | GFP_KERNEL, &objios); |
816 | } | ||
817 | per_dev->or = or; | ||
818 | |||
819 | osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); | ||
820 | |||
821 | ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); | ||
822 | if (ret) { | ||
823 | dprintk("%s: Faild to osd_finalize_request() => %d\n", | ||
824 | __func__, ret); | ||
825 | goto err; | ||
826 | } | ||
827 | |||
828 | dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", | ||
829 | __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), | ||
830 | per_dev->length); | ||
831 | |||
832 | err: | ||
833 | return ret; | ||
834 | } | ||
835 | |||
836 | static ssize_t _read_exec(struct objio_state *ios) | ||
837 | { | ||
838 | unsigned i; | ||
839 | int ret; | ||
840 | |||
841 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | ||
842 | if (!ios->per_dev[i].length) | ||
843 | continue; | ||
844 | ret = _read_mirrors(ios, i); | ||
845 | if (unlikely(ret)) | ||
846 | goto err; | ||
847 | } | ||
848 | |||
849 | ios->done = _read_done; | ||
850 | return _io_exec(ios); /* In sync mode exec returns the io status */ | ||
851 | |||
852 | err: | ||
853 | _io_free(ios); | ||
854 | return ret; | ||
855 | } | ||
856 | |||
857 | ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) | ||
858 | { | ||
859 | struct objio_state *ios = container_of(ol_state, struct objio_state, | ||
860 | ol_state); | ||
861 | int ret; | ||
862 | |||
863 | ret = _io_rw_pagelist(ios, GFP_KERNEL); | ||
864 | if (unlikely(ret)) | 430 | if (unlikely(ret)) |
865 | return ret; | 431 | return ret; |
866 | 432 | ||
867 | return _read_exec(ios); | 433 | objios->ios->done = _read_done; |
434 | dprintk("%s: offset=0x%llx length=0x%x\n", __func__, | ||
435 | rdata->args.offset, rdata->args.count); | ||
436 | return ore_read(objios->ios); | ||
868 | } | 437 | } |
869 | 438 | ||
870 | /* | 439 | /* |
871 | * write | 440 | * write |
872 | */ | 441 | */ |
873 | static ssize_t _write_done(struct objio_state *ios) | 442 | static void _write_done(struct ore_io_state *ios, void *private) |
874 | { | 443 | { |
444 | struct objio_state *objios = private; | ||
875 | ssize_t status; | 445 | ssize_t status; |
876 | int ret = _io_check(ios, true); | 446 | int ret = ore_check_io(ios, &__on_dev_error); |
877 | 447 | ||
878 | _io_free(ios); | 448 | /* FIXME: _io_free(ios): can we deallocate the libosd resources? */ |
879 | 449 | ||
880 | if (likely(!ret)) { | 450 | if (likely(!ret)) { |
881 | /* FIXME: should be based on the OSD's persistence model | 451 | /* FIXME: should be based on the OSD's persistence model |
882 | * See OSD2r05 Section 4.13 Data persistence model */ | 452 | * See OSD2r05 Section 4.13 Data persistence model */ |
883 | ios->ol_state.committed = NFS_FILE_SYNC; | 453 | objios->oir.committed = NFS_FILE_SYNC; |
884 | status = ios->length; | 454 | status = ios->length; |
885 | } else { | 455 | } else { |
886 | status = ret; | 456 | status = ret; |
887 | } | 457 | } |
888 | 458 | ||
889 | objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); | 459 | objlayout_write_done(&objios->oir, status, objios->sync); |
890 | return status; | ||
891 | } | 460 | } |
892 | 461 | ||
893 | static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) | 462 | static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) |
894 | { | 463 | { |
895 | struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; | 464 | struct objio_state *objios = priv; |
896 | unsigned dev = ios->per_dev[cur_comp].dev; | 465 | struct nfs_write_data *wdata = objios->oir.rpcdata; |
897 | unsigned last_comp = cur_comp + ios->layout->mirrors_p1; | 466 | pgoff_t index = offset / PAGE_SIZE; |
898 | int ret; | 467 | struct page *page = find_get_page(wdata->inode->i_mapping, index); |
899 | |||
900 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { | ||
901 | struct osd_request *or = NULL; | ||
902 | struct pnfs_osd_object_cred *cred = | ||
903 | &ios->layout->comps[cur_comp]; | ||
904 | struct osd_obj_id obj = { | ||
905 | .partition = cred->oc_object_id.oid_partition_id, | ||
906 | .id = cred->oc_object_id.oid_object_id, | ||
907 | }; | ||
908 | struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; | ||
909 | struct bio *bio; | ||
910 | |||
911 | or = osd_start_request(_io_od(ios, dev), GFP_NOFS); | ||
912 | if (unlikely(!or)) { | ||
913 | ret = -ENOMEM; | ||
914 | goto err; | ||
915 | } | ||
916 | per_dev->or = or; | ||
917 | |||
918 | if (per_dev != master_dev) { | ||
919 | bio = bio_kmalloc(GFP_NOFS, | ||
920 | master_dev->bio->bi_max_vecs); | ||
921 | if (unlikely(!bio)) { | ||
922 | dprintk("Faild to allocate BIO size=%u\n", | ||
923 | master_dev->bio->bi_max_vecs); | ||
924 | ret = -ENOMEM; | ||
925 | goto err; | ||
926 | } | ||
927 | |||
928 | __bio_clone(bio, master_dev->bio); | ||
929 | bio->bi_bdev = NULL; | ||
930 | bio->bi_next = NULL; | ||
931 | per_dev->bio = bio; | ||
932 | per_dev->dev = dev; | ||
933 | per_dev->length = master_dev->length; | ||
934 | per_dev->offset = master_dev->offset; | ||
935 | } else { | ||
936 | bio = master_dev->bio; | ||
937 | bio->bi_rw |= REQ_WRITE; | ||
938 | } | ||
939 | |||
940 | osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); | ||
941 | 468 | ||
942 | ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); | 469 | if (!page) { |
943 | if (ret) { | 470 | page = find_or_create_page(wdata->inode->i_mapping, |
944 | dprintk("%s: Faild to osd_finalize_request() => %d\n", | 471 | index, GFP_NOFS); |
945 | __func__, ret); | 472 | if (unlikely(!page)) { |
946 | goto err; | 473 | dprintk("%s: grab_cache_page Failed index=0x%lx\n", |
474 | __func__, index); | ||
475 | return NULL; | ||
947 | } | 476 | } |
948 | 477 | unlock_page(page); | |
949 | dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", | ||
950 | __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), | ||
951 | per_dev->length); | ||
952 | } | 478 | } |
479 | if (PageDirty(page) || PageWriteback(page)) | ||
480 | *uptodate = true; | ||
481 | else | ||
482 | *uptodate = PageUptodate(page); | ||
483 | dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); | ||
484 | return page; | ||
485 | } | ||
953 | 486 | ||
954 | err: | 487 | static void __r4w_put_page(void *priv, struct page *page) |
955 | return ret; | 488 | { |
489 | dprintk("%s: index=0x%lx\n", __func__, page->index); | ||
490 | page_cache_release(page); | ||
491 | return; | ||
956 | } | 492 | } |
957 | 493 | ||
958 | static ssize_t _write_exec(struct objio_state *ios) | 494 | static const struct _ore_r4w_op _r4w_op = { |
495 | .get_page = &__r4w_get_page, | ||
496 | .put_page = &__r4w_put_page, | ||
497 | }; | ||
498 | |||
499 | int objio_write_pagelist(struct nfs_write_data *wdata, int how) | ||
959 | { | 500 | { |
960 | unsigned i; | 501 | struct objio_state *objios; |
961 | int ret; | 502 | int ret; |
962 | 503 | ||
963 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | 504 | ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false, |
964 | if (!ios->per_dev[i].length) | 505 | wdata->lseg, wdata->args.pages, wdata->args.pgbase, |
965 | continue; | 506 | wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, |
966 | ret = _write_mirrors(ios, i); | 507 | &objios); |
967 | if (unlikely(ret)) | 508 | if (unlikely(ret)) |
968 | goto err; | 509 | return ret; |
969 | } | ||
970 | |||
971 | ios->done = _write_done; | ||
972 | return _io_exec(ios); /* In sync mode exec returns the io->status */ | ||
973 | 510 | ||
974 | err: | 511 | objios->sync = 0 != (how & FLUSH_SYNC); |
975 | _io_free(ios); | 512 | objios->ios->r4w = &_r4w_op; |
976 | return ret; | ||
977 | } | ||
978 | 513 | ||
979 | ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) | 514 | if (!objios->sync) |
980 | { | 515 | objios->ios->done = _write_done; |
981 | struct objio_state *ios = container_of(ol_state, struct objio_state, | ||
982 | ol_state); | ||
983 | int ret; | ||
984 | 516 | ||
985 | /* TODO: ios->stable = stable; */ | 517 | dprintk("%s: offset=0x%llx length=0x%x\n", __func__, |
986 | ret = _io_rw_pagelist(ios, GFP_NOFS); | 518 | wdata->args.offset, wdata->args.count); |
519 | ret = ore_write(objios->ios); | ||
987 | if (unlikely(ret)) | 520 | if (unlikely(ret)) |
988 | return ret; | 521 | return ret; |
989 | 522 | ||
990 | return _write_exec(ios); | 523 | if (objios->sync) |
524 | _write_done(objios->ios, objios); | ||
525 | |||
526 | return 0; | ||
991 | } | 527 | } |
992 | 528 | ||
993 | static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, | 529 | static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, |
@@ -997,7 +533,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, | |||
997 | return false; | 533 | return false; |
998 | 534 | ||
999 | return pgio->pg_count + req->wb_bytes <= | 535 | return pgio->pg_count + req->wb_bytes <= |
1000 | OBJIO_LSEG(pgio->pg_lseg)->max_io_size; | 536 | OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; |
1001 | } | 537 | } |
1002 | 538 | ||
1003 | static const struct nfs_pageio_ops objio_pg_read_ops = { | 539 | static const struct nfs_pageio_ops objio_pg_read_ops = { |
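
The r4w ("read-for-write") hooks registered above let the ORE engine borrow page-cache pages when it must read stripe data back to compute RAID parity: __r4w_get_page() returns the cached page plus whether its contents are already valid, and __r4w_put_page() drops the reference. The validity rule, extracted as a sketch (kernel context assumed):

    /* a page that is dirty or under writeback is newer than the disk, so
     * it must be treated as valid and never overwritten with read-back
     * data; otherwise PageUptodate() decides */
    static bool page_content_valid(struct page *page)
    {
            if (PageDirty(page) || PageWriteback(page))
                    return true;
            return PageUptodate(page);
    }
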
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1d06f8e2ade..72074e3a04f 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c | |||
@@ -156,77 +156,39 @@ last_byte_offset(u64 start, u64 len) | |||
156 | return end > start ? end - 1 : NFS4_MAX_UINT64; | 156 | return end > start ? end - 1 : NFS4_MAX_UINT64; |
157 | } | 157 | } |
158 | 158 | ||
159 | static struct objlayout_io_state * | 159 | void _fix_verify_io_params(struct pnfs_layout_segment *lseg, |
160 | objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, | 160 | struct page ***p_pages, unsigned *p_pgbase, |
161 | struct page **pages, | 161 | u64 offset, unsigned long count) |
162 | unsigned pgbase, | ||
163 | loff_t offset, | ||
164 | size_t count, | ||
165 | struct pnfs_layout_segment *lseg, | ||
166 | void *rpcdata, | ||
167 | gfp_t gfp_flags) | ||
168 | { | 162 | { |
169 | struct objlayout_io_state *state; | ||
170 | u64 lseg_end_offset; | 163 | u64 lseg_end_offset; |
171 | 164 | ||
172 | dprintk("%s: allocating io_state\n", __func__); | ||
173 | if (objio_alloc_io_state(lseg, &state, gfp_flags)) | ||
174 | return NULL; | ||
175 | |||
176 | BUG_ON(offset < lseg->pls_range.offset); | 165 | BUG_ON(offset < lseg->pls_range.offset); |
177 | lseg_end_offset = end_offset(lseg->pls_range.offset, | 166 | lseg_end_offset = end_offset(lseg->pls_range.offset, |
178 | lseg->pls_range.length); | 167 | lseg->pls_range.length); |
179 | BUG_ON(offset >= lseg_end_offset); | 168 | BUG_ON(offset >= lseg_end_offset); |
180 | if (offset + count > lseg_end_offset) { | 169 | WARN_ON(offset + count > lseg_end_offset); |
181 | count = lseg->pls_range.length - | ||
182 | (offset - lseg->pls_range.offset); | ||
183 | dprintk("%s: truncated count %Zd\n", __func__, count); | ||
184 | } | ||
185 | 170 | ||
186 | if (pgbase > PAGE_SIZE) { | 171 | if (*p_pgbase > PAGE_SIZE) { |
187 | pages += pgbase >> PAGE_SHIFT; | 172 | dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); |
188 | pgbase &= ~PAGE_MASK; | 173 | *p_pages += *p_pgbase >> PAGE_SHIFT; |
174 | *p_pgbase &= ~PAGE_MASK; | ||
189 | } | 175 | } |
190 | |||
191 | INIT_LIST_HEAD(&state->err_list); | ||
192 | state->lseg = lseg; | ||
193 | state->rpcdata = rpcdata; | ||
194 | state->pages = pages; | ||
195 | state->pgbase = pgbase; | ||
196 | state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
197 | state->offset = offset; | ||
198 | state->count = count; | ||
199 | state->sync = 0; | ||
200 | |||
201 | return state; | ||
202 | } | ||
203 | |||
204 | static void | ||
205 | objlayout_free_io_state(struct objlayout_io_state *state) | ||
206 | { | ||
207 | dprintk("%s: freeing io_state\n", __func__); | ||
208 | if (unlikely(!state)) | ||
209 | return; | ||
210 | |||
211 | objio_free_io_state(state); | ||
212 | } | 176 | } |
213 | 177 | ||
214 | /* | 178 | /* |
215 | * I/O done common code | 179 | * I/O done common code |
216 | */ | 180 | */ |
217 | static void | 181 | static void |
218 | objlayout_iodone(struct objlayout_io_state *state) | 182 | objlayout_iodone(struct objlayout_io_res *oir) |
219 | { | 183 | { |
220 | dprintk("%s: state %p status\n", __func__, state); | 184 | if (likely(oir->status >= 0)) { |
221 | 185 | objio_free_result(oir); | |
222 | if (likely(state->status >= 0)) { | ||
223 | objlayout_free_io_state(state); | ||
224 | } else { | 186 | } else { |
225 | struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); | 187 | struct objlayout *objlay = oir->objlay; |
226 | 188 | ||
227 | spin_lock(&objlay->lock); | 189 | spin_lock(&objlay->lock); |
228 | objlay->delta_space_valid = OBJ_DSU_INVALID; | 190 | objlay->delta_space_valid = OBJ_DSU_INVALID; |
229 | list_add(&objlay->err_list, &state->err_list); | 191 | list_add(&objlay->err_list, &oir->err_list); |
230 | spin_unlock(&objlay->lock); | 192 | spin_unlock(&objlay->lock); |
231 | } | 193 | } |
232 | } | 194 | } |
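
_fix_verify_io_params() keeps the old pgbase normalization: whole pages hidden in pgbase move into the pages array index and only the sub-page remainder stays, so the (pages, pgbase) pair still names the same byte. Worked through with 4 KiB pages:

    #include <stdio.h>
    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long pgbase = 0x2300;               /* spans 2 whole pages */
            unsigned long skip   = pgbase >> PAGE_SHIFT; /* pages += 2 */

            pgbase &= ~PAGE_MASK;                        /* 0x300 remains */
            printf("skip %lu pages, pgbase 0x%lx\n", skip, pgbase);
            return 0;   /* prints: skip 2 pages, pgbase 0x300 */
    }
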
@@ -238,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state) | |||
238 | * the error for later reporting at layout-return. | 200 | * the error for later reporting at layout-return. |
239 | */ | 201 | */ |
240 | void | 202 | void |
241 | objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, | 203 | objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, |
242 | struct pnfs_osd_objid *pooid, int osd_error, | 204 | struct pnfs_osd_objid *pooid, int osd_error, |
243 | u64 offset, u64 length, bool is_write) | 205 | u64 offset, u64 length, bool is_write) |
244 | { | 206 | { |
245 | struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; | 207 | struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; |
246 | 208 | ||
247 | BUG_ON(index >= state->num_comps); | 209 | BUG_ON(index >= oir->num_comps); |
248 | if (osd_error) { | 210 | if (osd_error) { |
249 | ioerr->oer_component = *pooid; | 211 | ioerr->oer_component = *pooid; |
250 | ioerr->oer_comp_offset = offset; | 212 | ioerr->oer_comp_offset = offset; |
@@ -285,21 +247,18 @@ static void _rpc_read_complete(struct work_struct *work) | |||
285 | } | 247 | } |
286 | 248 | ||
287 | void | 249 | void |
288 | objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) | 250 | objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) |
289 | { | 251 | { |
290 | int eof = state->eof; | 252 | struct nfs_read_data *rdata = oir->rpcdata; |
291 | struct nfs_read_data *rdata; | ||
292 | 253 | ||
293 | state->status = status; | 254 | oir->status = rdata->task.tk_status = status; |
294 | dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); | 255 | if (status >= 0) |
295 | rdata = state->rpcdata; | ||
296 | rdata->task.tk_status = status; | ||
297 | if (status >= 0) { | ||
298 | rdata->res.count = status; | 256 | rdata->res.count = status; |
299 | rdata->res.eof = eof; | 257 | objlayout_iodone(oir); |
300 | } | 258 | /* must not use oir after this point */ |
301 | objlayout_iodone(state); | 259 | |
302 | /* must not use state after this point */ | 260 | dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, |
261 | status, rdata->res.eof, sync); | ||
303 | 262 | ||
304 | if (sync) | 263 | if (sync) |
305 | pnfs_ld_read_done(rdata); | 264 | pnfs_ld_read_done(rdata); |
@@ -317,40 +276,36 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) | |||
317 | { | 276 | { |
318 | loff_t offset = rdata->args.offset; | 277 | loff_t offset = rdata->args.offset; |
319 | size_t count = rdata->args.count; | 278 | size_t count = rdata->args.count; |
320 | struct objlayout_io_state *state; | 279 | int err; |
321 | ssize_t status = 0; | ||
322 | loff_t eof; | 280 | loff_t eof; |
323 | 281 | ||
324 | dprintk("%s: Begin inode %p offset %llu count %d\n", | ||
325 | __func__, rdata->inode, offset, (int)count); | ||
326 | |||
327 | eof = i_size_read(rdata->inode); | 282 | eof = i_size_read(rdata->inode); |
328 | if (unlikely(offset + count > eof)) { | 283 | if (unlikely(offset + count > eof)) { |
329 | if (offset >= eof) { | 284 | if (offset >= eof) { |
330 | status = 0; | 285 | err = 0; |
331 | rdata->res.count = 0; | 286 | rdata->res.count = 0; |
332 | rdata->res.eof = 1; | 287 | rdata->res.eof = 1; |
288 | /* FIXME: do we need to call pnfs_ld_read_done()? */ | ||
333 | goto out; | 289 | goto out; |
334 | } | 290 | } |
335 | count = eof - offset; | 291 | count = eof - offset; |
336 | } | 292 | } |
337 | 293 | ||
338 | state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, | 294 | rdata->res.eof = (offset + count) >= eof; |
339 | rdata->args.pages, rdata->args.pgbase, | 295 | _fix_verify_io_params(rdata->lseg, &rdata->args.pages, |
340 | offset, count, | 296 | &rdata->args.pgbase, |
341 | rdata->lseg, rdata, | 297 | rdata->args.offset, rdata->args.count); |
342 | GFP_KERNEL); | ||
343 | if (unlikely(!state)) { | ||
344 | status = -ENOMEM; | ||
345 | goto out; | ||
346 | } | ||
347 | 298 | ||
348 | state->eof = state->offset + state->count >= eof; | 299 | dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", |
300 | __func__, rdata->inode->i_ino, offset, count, rdata->res.eof); | ||
349 | 301 | ||
350 | status = objio_read_pagelist(state); | 302 | err = objio_read_pagelist(rdata); |
351 | out: | 303 | out: |
352 | dprintk("%s: Return status %Zd\n", __func__, status); | 304 | if (unlikely(err)) { |
353 | rdata->pnfs_error = status; | 305 | rdata->pnfs_error = err; |
306 | dprintk("%s: Returned Error %d\n", __func__, err); | ||
307 | return PNFS_NOT_ATTEMPTED; | ||
308 | } | ||
354 | return PNFS_ATTEMPTED; | 309 | return PNFS_ATTEMPTED; |
355 | } | 310 | } |
356 | 311 | ||
@@ -371,26 +326,20 @@ static void _rpc_write_complete(struct work_struct *work) | |||
371 | } | 326 | } |
372 | 327 | ||
373 | void | 328 | void |
374 | objlayout_write_done(struct objlayout_io_state *state, ssize_t status, | 329 | objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) |
375 | bool sync) | ||
376 | { | 330 | { |
377 | struct nfs_write_data *wdata; | 331 | struct nfs_write_data *wdata = oir->rpcdata; |
378 | 332 | ||
379 | dprintk("%s: Begin\n", __func__); | 333 | oir->status = wdata->task.tk_status = status; |
380 | wdata = state->rpcdata; | ||
381 | state->status = status; | ||
382 | wdata->task.tk_status = status; | ||
383 | if (status >= 0) { | 334 | if (status >= 0) { |
384 | wdata->res.count = status; | 335 | wdata->res.count = status; |
385 | wdata->verf.committed = state->committed; | 336 | wdata->verf.committed = oir->committed; |
386 | dprintk("%s: Return status %d committed %d\n", | 337 | } |
387 | __func__, wdata->task.tk_status, | 338 | objlayout_iodone(oir); |
388 | wdata->verf.committed); | 339 | /* must not use oir after this point */ |
389 | } else | 340 | |
390 | dprintk("%s: Return status %d\n", | 341 | dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, |
391 | __func__, wdata->task.tk_status); | 342 | status, wdata->verf.committed, sync); |
392 | objlayout_iodone(state); | ||
393 | /* must not use state after this point */ | ||
394 | 343 | ||
395 | if (sync) | 344 | if (sync) |
396 | pnfs_ld_write_done(wdata); | 345 | pnfs_ld_write_done(wdata); |
@@ -407,30 +356,18 @@ enum pnfs_try_status | |||
407 | objlayout_write_pagelist(struct nfs_write_data *wdata, | 356 | objlayout_write_pagelist(struct nfs_write_data *wdata, |
408 | int how) | 357 | int how) |
409 | { | 358 | { |
410 | struct objlayout_io_state *state; | 359 | int err; |
411 | ssize_t status; | ||
412 | |||
413 | dprintk("%s: Begin inode %p offset %llu count %u\n", | ||
414 | __func__, wdata->inode, wdata->args.offset, wdata->args.count); | ||
415 | |||
416 | state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, | ||
417 | wdata->args.pages, | ||
418 | wdata->args.pgbase, | ||
419 | wdata->args.offset, | ||
420 | wdata->args.count, | ||
421 | wdata->lseg, wdata, | ||
422 | GFP_NOFS); | ||
423 | if (unlikely(!state)) { | ||
424 | status = -ENOMEM; | ||
425 | goto out; | ||
426 | } | ||
427 | 360 | ||
428 | state->sync = how & FLUSH_SYNC; | 361 | _fix_verify_io_params(wdata->lseg, &wdata->args.pages, |
362 | &wdata->args.pgbase, | ||
363 | wdata->args.offset, wdata->args.count); | ||
429 | 364 | ||
430 | status = objio_write_pagelist(state, how & FLUSH_STABLE); | 365 | err = objio_write_pagelist(wdata, how); |
431 | out: | 366 | if (unlikely(err)) { |
432 | dprintk("%s: Return status %Zd\n", __func__, status); | 367 | wdata->pnfs_error = err; |
433 | wdata->pnfs_error = status; | 368 | dprintk("%s: Returned Error %d\n", __func__, err); |
369 | return PNFS_NOT_ATTEMPTED; | ||
370 | } | ||
434 | return PNFS_ATTEMPTED; | 371 | return PNFS_ATTEMPTED; |
435 | } | 372 | } |
436 | 373 | ||
@@ -537,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err, | |||
537 | static void | 474 | static void |
538 | encode_accumulated_error(struct objlayout *objlay, __be32 *p) | 475 | encode_accumulated_error(struct objlayout *objlay, __be32 *p) |
539 | { | 476 | { |
540 | struct objlayout_io_state *state, *tmp; | 477 | struct objlayout_io_res *oir, *tmp; |
541 | struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; | 478 | struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; |
542 | 479 | ||
543 | list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { | 480 | list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { |
544 | unsigned i; | 481 | unsigned i; |
545 | 482 | ||
546 | for (i = 0; i < state->num_comps; i++) { | 483 | for (i = 0; i < oir->num_comps; i++) { |
547 | struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; | 484 | struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; |
548 | 485 | ||
549 | if (!ioerr->oer_errno) | 486 | if (!ioerr->oer_errno) |
550 | continue; | 487 | continue; |
@@ -563,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) | |||
563 | 500 | ||
564 | merge_ioerr(&accumulated_err, ioerr); | 501 | merge_ioerr(&accumulated_err, ioerr); |
565 | } | 502 | } |
566 | list_del(&state->err_list); | 503 | list_del(&oir->err_list); |
567 | objlayout_free_io_state(state); | 504 | objio_free_result(oir); |
568 | } | 505 | } |
569 | 506 | ||
570 | pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); | 507 | pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); |
@@ -576,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, | |||
576 | const struct nfs4_layoutreturn_args *args) | 513 | const struct nfs4_layoutreturn_args *args) |
577 | { | 514 | { |
578 | struct objlayout *objlay = OBJLAYOUT(pnfslay); | 515 | struct objlayout *objlay = OBJLAYOUT(pnfslay); |
579 | struct objlayout_io_state *state, *tmp; | 516 | struct objlayout_io_res *oir, *tmp; |
580 | __be32 *start; | 517 | __be32 *start; |
581 | 518 | ||
582 | dprintk("%s: Begin\n", __func__); | 519 | dprintk("%s: Begin\n", __func__); |
@@ -585,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, | |||
585 | 522 | ||
586 | spin_lock(&objlay->lock); | 523 | spin_lock(&objlay->lock); |
587 | 524 | ||
588 | list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { | 525 | list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { |
589 | __be32 *last_xdr = NULL, *p; | 526 | __be32 *last_xdr = NULL, *p; |
590 | unsigned i; | 527 | unsigned i; |
591 | int res = 0; | 528 | int res = 0; |
592 | 529 | ||
593 | for (i = 0; i < state->num_comps; i++) { | 530 | for (i = 0; i < oir->num_comps; i++) { |
594 | struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; | 531 | struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; |
595 | 532 | ||
596 | if (!ioerr->oer_errno) | 533 | if (!ioerr->oer_errno) |
597 | continue; | 534 | continue; |
@@ -615,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, | |||
615 | } | 552 | } |
616 | 553 | ||
617 | last_xdr = p; | 554 | last_xdr = p; |
618 | pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); | 555 | pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); |
619 | } | 556 | } |
620 | 557 | ||
621 | /* TODO: use xdr_write_pages */ | 558 | /* TODO: use xdr_write_pages */ |
@@ -631,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, | |||
631 | encode_accumulated_error(objlay, last_xdr); | 568 | encode_accumulated_error(objlay, last_xdr); |
632 | goto loop_done; | 569 | goto loop_done; |
633 | } | 570 | } |
634 | list_del(&state->err_list); | 571 | list_del(&oir->err_list); |
635 | objlayout_free_io_state(state); | 572 | objio_free_result(oir); |
636 | } | 573 | } |
637 | loop_done: | 574 | loop_done: |
638 | spin_unlock(&objlay->lock); | 575 | spin_unlock(&objlay->lock); |
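A note on the ownership model the hunks above establish: the generic layer no longer allocates a per-I/O state. The raid engine embeds a struct objlayout_io_res in its own per-I/O object, and objio_free_result() is the single free point, whether the result is consumed on the success path in objlayout_iodone() or later off objlay->err_list at layoutreturn time. A minimal sketch of an engine honoring that contract (struct objio_state and its members are hypothetical, not part of this patch):

struct objio_state {
	struct objlayout_io_res oir;	/* freed only via objio_free_result() */
	struct pnfs_osd_ioerr ioerrs[1];
	/* ... engine-private I/O bookkeeping ... */
};

int objio_read_pagelist(struct nfs_read_data *rdata)
{
	struct objio_state *ios = kzalloc(sizeof(*ios), GFP_KERNEL);

	if (unlikely(!ios))
		return -ENOMEM;

	objlayout_init_ioerrs(&ios->oir, ARRAY_SIZE(ios->ioerrs),
			      ios->ioerrs, rdata,
			      NFS_I(rdata->inode)->layout);
	/* ... submit the OSD reads; the completion callback ends with: */
	objlayout_read_done(&ios->oir, rdata->args.count, false /* async */);
	return 0;
}

void objio_free_result(struct objlayout_io_res *oir)
{
	kfree(container_of(oir, struct objio_state, oir));
}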
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index a8244c8e042..8ec34727ed2 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h | |||
@@ -74,19 +74,11 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) | |||
74 | * per-I/O operation state | 74 | * per-I/O operation state |
75 | * embedded in objects provider io_state data structure | 75 | * embedded in objects provider io_state data structure |
76 | */ | 76 | */ |
77 | struct objlayout_io_state { | 77 | struct objlayout_io_res { |
78 | struct pnfs_layout_segment *lseg; | 78 | struct objlayout *objlay; |
79 | |||
80 | struct page **pages; | ||
81 | unsigned pgbase; | ||
82 | unsigned nr_pages; | ||
83 | unsigned long count; | ||
84 | loff_t offset; | ||
85 | bool sync; | ||
86 | 79 | ||
87 | void *rpcdata; | 80 | void *rpcdata; |
88 | int status; /* res */ | 81 | int status; /* res */ |
89 | int eof; /* res */ | ||
90 | int committed; /* res */ | 82 | int committed; /* res */ |
91 | 83 | ||
92 | /* Error reporting (layout_return) */ | 84 | /* Error reporting (layout_return) */ |
@@ -100,6 +92,18 @@ struct objlayout_io_state { | |||
100 | struct pnfs_osd_ioerr *ioerrs; | 92 | struct pnfs_osd_ioerr *ioerrs; |
101 | }; | 93 | }; |
102 | 94 | ||
95 | static inline | ||
96 | void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, | ||
97 | struct pnfs_osd_ioerr *ioerrs, void *rpcdata, | ||
98 | struct pnfs_layout_hdr *pnfs_layout_type) | ||
99 | { | ||
100 | oir->objlay = OBJLAYOUT(pnfs_layout_type); | ||
101 | oir->rpcdata = rpcdata; | ||
102 | INIT_LIST_HEAD(&oir->err_list); | ||
103 | oir->num_comps = num_comps; | ||
104 | oir->ioerrs = ioerrs; | ||
105 | } | ||
106 | |||
103 | /* | 107 | /* |
104 | * Raid engine I/O API | 108 | * Raid engine I/O API |
105 | */ | 109 | */ |
@@ -110,28 +114,24 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, | |||
110 | gfp_t gfp_flags); | 114 | gfp_t gfp_flags); |
111 | extern void objio_free_lseg(struct pnfs_layout_segment *lseg); | 115 | extern void objio_free_lseg(struct pnfs_layout_segment *lseg); |
112 | 116 | ||
113 | extern int objio_alloc_io_state( | 117 | /* objio_free_result will free these @oir structs received from
114 | struct pnfs_layout_segment *lseg, | 118 | * objlayout_{read,write}_done |
115 | struct objlayout_io_state **outp, | 119 | */ |
116 | gfp_t gfp_flags); | 120 | extern void objio_free_result(struct objlayout_io_res *oir); |
117 | extern void objio_free_io_state(struct objlayout_io_state *state); | ||
118 | 121 | ||
119 | extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); | 122 | extern int objio_read_pagelist(struct nfs_read_data *rdata); |
120 | extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, | 123 | extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); |
121 | bool stable); | ||
122 | 124 | ||
123 | /* | 125 | /* |
124 | * callback API | 126 | * callback API |
125 | */ | 127 | */ |
126 | extern void objlayout_io_set_result(struct objlayout_io_state *state, | 128 | extern void objlayout_io_set_result(struct objlayout_io_res *oir, |
127 | unsigned index, struct pnfs_osd_objid *pooid, | 129 | unsigned index, struct pnfs_osd_objid *pooid, |
128 | int osd_error, u64 offset, u64 length, bool is_write); | 130 | int osd_error, u64 offset, u64 length, bool is_write); |
129 | 131 | ||
130 | static inline void | 132 | static inline void |
131 | objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) | 133 | objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) |
132 | { | 134 | { |
133 | struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); | ||
134 | |||
135 | /* If one of the I/Os errored out and the delta_space_used was | 135 | /* If one of the I/Os errored out and the delta_space_used was |
136 | * invalid we render the complete report as invalid. Protocol mandate | 136 | * invalid we render the complete report as invalid. Protocol mandate |
137 | * the DSU be accurate or not reported. | 137 | * the DSU be accurate or not reported. |
@@ -144,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) | |||
144 | spin_unlock(&objlay->lock); | 144 | spin_unlock(&objlay->lock); |
145 | } | 145 | } |
146 | 146 | ||
147 | extern void objlayout_read_done(struct objlayout_io_state *state, | 147 | extern void objlayout_read_done(struct objlayout_io_res *oir, |
148 | ssize_t status, bool sync); | 148 | ssize_t status, bool sync); |
149 | extern void objlayout_write_done(struct objlayout_io_state *state, | 149 | extern void objlayout_write_done(struct objlayout_io_res *oir, |
150 | ssize_t status, bool sync); | 150 | ssize_t status, bool sync); |
151 | 151 | ||
152 | extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, | 152 | extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, |
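Since objlayout_io_res no longer carries an lseg, objlayout_add_delta_space_used() above now takes the struct objlayout directly, which the engine already has at hand via oir->objlay. A sketch of a write-completion path using the new signature (function and member names are illustrative only):

static void objio_on_write_done(struct objio_state *ios, ssize_t status)
{
	if (status >= 0)
		objlayout_add_delta_space_used(ios->oir.objlay,
					       ios->space_written);
	ios->oir.committed = NFS_FILE_SYNC;
	objlayout_write_done(&ios->oir, status, false);
	/* oir (and ios) are gone once objlayout_write_done() returns */
}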
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b60970cc7f1..5668f7c54c4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/nfs_page.h> | 18 | #include <linux/nfs_page.h> |
19 | #include <linux/nfs_fs.h> | 19 | #include <linux/nfs_fs.h> |
20 | #include <linux/nfs_mount.h> | 20 | #include <linux/nfs_mount.h> |
21 | #include <linux/export.h> | ||
21 | 22 | ||
22 | #include "internal.h" | 23 | #include "internal.h" |
23 | #include "pnfs.h" | 24 | #include "pnfs.h" |
@@ -41,7 +42,7 @@ nfs_page_free(struct nfs_page *p) | |||
41 | 42 | ||
42 | /** | 43 | /** |
43 | * nfs_create_request - Create an NFS read/write request. | 44 | * nfs_create_request - Create an NFS read/write request. |
44 | * @file: file descriptor to use | 45 | * @ctx: open context to use |
45 | * @inode: inode to which the request is attached | 46 | * @inode: inode to which the request is attached |
46 | * @page: page to write | 47 | * @page: page to write |
47 | * @offset: starting offset within the page for the write | 48 | * @offset: starting offset within the page for the write |
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ee73d9a4f70..8e672a2b2d6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c | |||
@@ -29,6 +29,7 @@ | |||
29 | 29 | ||
30 | #include <linux/nfs_fs.h> | 30 | #include <linux/nfs_fs.h> |
31 | #include <linux/nfs_page.h> | 31 | #include <linux/nfs_page.h> |
32 | #include <linux/module.h> | ||
32 | #include "internal.h" | 33 | #include "internal.h" |
33 | #include "pnfs.h" | 34 | #include "pnfs.h" |
34 | #include "iostat.h" | 35 | #include "iostat.h" |
@@ -1259,6 +1260,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) | |||
1259 | } | 1260 | } |
1260 | EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); | 1261 | EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); |
1261 | 1262 | ||
1263 | static void pnfs_ld_handle_read_error(struct nfs_read_data *data) | ||
1264 | { | ||
1265 | struct nfs_pageio_descriptor pgio; | ||
1266 | |||
1267 | put_lseg(data->lseg); | ||
1268 | data->lseg = NULL; | ||
1269 | dprintk("pnfs write error = %d\n", data->pnfs_error); | ||
1270 | |||
1271 | nfs_pageio_init_read_mds(&pgio, data->inode); | ||
1272 | |||
1273 | while (!list_empty(&data->pages)) { | ||
1274 | struct nfs_page *req = nfs_list_entry(data->pages.next); | ||
1275 | |||
1276 | nfs_list_remove_request(req); | ||
1277 | nfs_pageio_add_request(&pgio, req); | ||
1278 | } | ||
1279 | nfs_pageio_complete(&pgio); | ||
1280 | } | ||
1281 | |||
1262 | /* | 1282 | /* |
1263 | * Called by non rpc-based layout drivers | 1283 | * Called by non rpc-based layout drivers |
1264 | */ | 1284 | */ |
@@ -1267,11 +1287,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data) | |||
1267 | if (likely(!data->pnfs_error)) { | 1287 | if (likely(!data->pnfs_error)) { |
1268 | __nfs4_read_done_cb(data); | 1288 | __nfs4_read_done_cb(data); |
1269 | data->mds_ops->rpc_call_done(&data->task, data); | 1289 | data->mds_ops->rpc_call_done(&data->task, data); |
1270 | } else { | 1290 | } else |
1271 | put_lseg(data->lseg); | 1291 | pnfs_ld_handle_read_error(data); |
1272 | data->lseg = NULL; | ||
1273 | dprintk("pnfs write error = %d\n", data->pnfs_error); | ||
1274 | } | ||
1275 | data->mds_ops->rpc_release(data); | 1292 | data->mds_ops->rpc_release(data); |
1276 | } | 1293 | } |
1277 | EXPORT_SYMBOL_GPL(pnfs_ld_read_done); | 1294 | EXPORT_SYMBOL_GPL(pnfs_ld_read_done); |
@@ -1443,17 +1460,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) | |||
1443 | /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ | 1460 | /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ |
1444 | data = kzalloc(sizeof(*data), GFP_NOFS); | 1461 | data = kzalloc(sizeof(*data), GFP_NOFS); |
1445 | if (!data) { | 1462 | if (!data) { |
1446 | mark_inode_dirty_sync(inode); | ||
1447 | status = -ENOMEM; | 1463 | status = -ENOMEM; |
1448 | goto out; | 1464 | goto out; |
1449 | } | 1465 | } |
1450 | 1466 | ||
1467 | if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) | ||
1468 | goto out_free; | ||
1469 | |||
1470 | if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { | ||
1471 | if (!sync) { | ||
1472 | status = -EAGAIN; | ||
1473 | goto out_free; | ||
1474 | } | ||
1475 | status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, | ||
1476 | nfs_wait_bit_killable, TASK_KILLABLE); | ||
1477 | if (status) | ||
1478 | goto out_free; | ||
1479 | } | ||
1480 | |||
1451 | INIT_LIST_HEAD(&data->lseg_list); | 1481 | INIT_LIST_HEAD(&data->lseg_list); |
1452 | spin_lock(&inode->i_lock); | 1482 | spin_lock(&inode->i_lock); |
1453 | if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { | 1483 | if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { |
1484 | clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); | ||
1454 | spin_unlock(&inode->i_lock); | 1485 | spin_unlock(&inode->i_lock); |
1455 | kfree(data); | 1486 | wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); |
1456 | goto out; | 1487 | goto out_free; |
1457 | } | 1488 | } |
1458 | 1489 | ||
1459 | pnfs_list_write_lseg(inode, &data->lseg_list); | 1490 | pnfs_list_write_lseg(inode, &data->lseg_list); |
@@ -1475,6 +1506,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) | |||
1475 | 1506 | ||
1476 | status = nfs4_proc_layoutcommit(data, sync); | 1507 | status = nfs4_proc_layoutcommit(data, sync); |
1477 | out: | 1508 | out: |
1509 | if (status) | ||
1510 | mark_inode_dirty_sync(inode); | ||
1478 | dprintk("<-- %s status %d\n", __func__, status); | 1511 | dprintk("<-- %s status %d\n", __func__, status); |
1479 | return status; | 1512 | return status; |
1513 | out_free: | ||
1514 | kfree(data); | ||
1515 | goto out; | ||
1480 | } | 1516 | } |
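The NFS_INO_LAYOUTCOMMITTING logic added above is the standard "one operation in flight" bit-lock idiom: trylock with test_and_set_bit(), optionally sleep on the bit, and pair every clear_bit() with a wake_up_bit(). Distilled to its shape (MY_BUSY and do_commit() are placeholders, not real identifiers):

static int commit_serialized(unsigned long *flags, bool can_wait)
{
	int ret;

	if (test_and_set_bit(MY_BUSY, flags)) {
		if (!can_wait)
			return -EAGAIN;	/* a commit is already in flight */
		ret = wait_on_bit_lock(flags, MY_BUSY,
				       nfs_wait_bit_killable, TASK_KILLABLE);
		if (ret)
			return ret;	/* fatal signal while waiting */
	}

	ret = do_commit();

	clear_bit(MY_BUSY, flags);
	smp_mb__after_clear_bit();	/* order the clear before waiter checks */
	wake_up_bit(flags, MY_BUSY);
	return ret;
}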
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 6fda5228ef5..4f359d2a26e 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c | |||
@@ -28,6 +28,7 @@ | |||
28 | * such damages. | 28 | * such damages. |
29 | */ | 29 | */ |
30 | 30 | ||
31 | #include <linux/export.h> | ||
31 | #include "pnfs.h" | 32 | #include "pnfs.h" |
32 | 33 | ||
33 | #define NFSDBG_FACILITY NFSDBG_PNFS | 34 | #define NFSDBG_FACILITY NFSDBG_PNFS |
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ac40b8535d7..f48125da198 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c | |||
@@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { | |||
710 | .dentry_ops = &nfs_dentry_operations, | 710 | .dentry_ops = &nfs_dentry_operations, |
711 | .dir_inode_ops = &nfs_dir_inode_operations, | 711 | .dir_inode_ops = &nfs_dir_inode_operations, |
712 | .file_inode_ops = &nfs_file_inode_operations, | 712 | .file_inode_ops = &nfs_file_inode_operations, |
713 | .file_ops = &nfs_file_operations, | ||
713 | .getroot = nfs_proc_get_root, | 714 | .getroot = nfs_proc_get_root, |
714 | .getattr = nfs_proc_getattr, | 715 | .getattr = nfs_proc_getattr, |
715 | .setattr = nfs_proc_setattr, | 716 | .setattr = nfs_proc_setattr, |
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8b48ec63f72..cfa175c223d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c | |||
@@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) | |||
109 | } | 109 | } |
110 | } | 110 | } |
111 | 111 | ||
112 | static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, | 112 | void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, |
113 | struct inode *inode) | 113 | struct inode *inode) |
114 | { | 114 | { |
115 | nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, | 115 | nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, |
@@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) | |||
534 | static void nfs_readpage_release_full(void *calldata) | 534 | static void nfs_readpage_release_full(void *calldata) |
535 | { | 535 | { |
536 | struct nfs_read_data *data = calldata; | 536 | struct nfs_read_data *data = calldata; |
537 | struct nfs_pageio_descriptor pgio; | ||
538 | 537 | ||
539 | if (data->pnfs_error) { | ||
540 | nfs_pageio_init_read_mds(&pgio, data->inode); | ||
541 | pgio.pg_recoalesce = 1; | ||
542 | } | ||
543 | while (!list_empty(&data->pages)) { | 538 | while (!list_empty(&data->pages)) { |
544 | struct nfs_page *req = nfs_list_entry(data->pages.next); | 539 | struct nfs_page *req = nfs_list_entry(data->pages.next); |
545 | 540 | ||
546 | nfs_list_remove_request(req); | 541 | nfs_list_remove_request(req); |
547 | if (!data->pnfs_error) | 542 | nfs_readpage_release(req); |
548 | nfs_readpage_release(req); | ||
549 | else | ||
550 | nfs_pageio_add_request(&pgio, req); | ||
551 | } | 543 | } |
552 | if (data->pnfs_error) | ||
553 | nfs_pageio_complete(&pgio); | ||
554 | nfs_readdata_release(calldata); | 544 | nfs_readdata_release(calldata); |
555 | } | 545 | } |
556 | 546 | ||
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 480b3b6bf71..134777406ee 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
@@ -2787,43 +2787,18 @@ static void nfs_referral_loop_unprotect(void) | |||
2787 | static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, | 2787 | static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, |
2788 | const char *export_path) | 2788 | const char *export_path) |
2789 | { | 2789 | { |
2790 | struct mnt_namespace *ns_private; | ||
2791 | struct super_block *s; | ||
2792 | struct dentry *dentry; | 2790 | struct dentry *dentry; |
2793 | struct path path; | 2791 | int ret = nfs_referral_loop_protect(); |
2794 | int ret; | ||
2795 | |||
2796 | ns_private = create_mnt_ns(root_mnt); | ||
2797 | ret = PTR_ERR(ns_private); | ||
2798 | if (IS_ERR(ns_private)) | ||
2799 | goto out_mntput; | ||
2800 | |||
2801 | ret = nfs_referral_loop_protect(); | ||
2802 | if (ret != 0) | ||
2803 | goto out_put_mnt_ns; | ||
2804 | 2792 | ||
2805 | ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, | 2793 | if (ret) { |
2806 | export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); | 2794 | mntput(root_mnt); |
2795 | return ERR_PTR(ret); | ||
2796 | } | ||
2807 | 2797 | ||
2798 | dentry = mount_subtree(root_mnt, export_path); | ||
2808 | nfs_referral_loop_unprotect(); | 2799 | nfs_referral_loop_unprotect(); |
2809 | put_mnt_ns(ns_private); | ||
2810 | |||
2811 | if (ret != 0) | ||
2812 | goto out_err; | ||
2813 | |||
2814 | s = path.mnt->mnt_sb; | ||
2815 | atomic_inc(&s->s_active); | ||
2816 | dentry = dget(path.dentry); | ||
2817 | 2800 | ||
2818 | path_put(&path); | ||
2819 | down_write(&s->s_umount); | ||
2820 | return dentry; | 2801 | return dentry; |
2821 | out_put_mnt_ns: | ||
2822 | put_mnt_ns(ns_private); | ||
2823 | out_mntput: | ||
2824 | mntput(root_mnt); | ||
2825 | out_err: | ||
2826 | return ERR_PTR(ret); | ||
2827 | } | 2802 | } |
2828 | 2803 | ||
2829 | static struct dentry *nfs4_try_mount(int flags, const char *dev_name, | 2804 | static struct dentry *nfs4_try_mount(int flags, const char *dev_name, |
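The block removed here is essentially the open-coded body of the new VFS helper mount_subtree(), which now hides the private mount namespace and vfs_path_lookup() dance. The calling convention relied on above: the helper consumes the vfsmount reference in both the success and error cases, and on success returns the subtree's root dentry with an active superblock reference and s_umount held for write, the same pinned state the old sequence produced. Sketch of a caller:

struct dentry *d = mount_subtree(root_mnt, export_path); /* eats root_mnt */

if (IS_ERR(d))
	return d;	/* nothing to put: root_mnt is already released */
/* ... hand d to the normal mount completion path, which eventually
 * drops s_umount and the s_active reference ... */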
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2219c88d96b..1dda78db6a7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/nfs_mount.h> | 20 | #include <linux/nfs_mount.h> |
21 | #include <linux/nfs_page.h> | 21 | #include <linux/nfs_page.h> |
22 | #include <linux/backing-dev.h> | 22 | #include <linux/backing-dev.h> |
23 | #include <linux/export.h> | ||
23 | 24 | ||
24 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
25 | 26 | ||
@@ -1243,7 +1244,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) | |||
1243 | { | 1244 | { |
1244 | struct nfs_writeargs *argp = &data->args; | 1245 | struct nfs_writeargs *argp = &data->args; |
1245 | struct nfs_writeres *resp = &data->res; | 1246 | struct nfs_writeres *resp = &data->res; |
1246 | struct nfs_server *server = NFS_SERVER(data->inode); | ||
1247 | int status; | 1247 | int status; |
1248 | 1248 | ||
1249 | dprintk("NFS: %5u nfs_writeback_done (status %d)\n", | 1249 | dprintk("NFS: %5u nfs_writeback_done (status %d)\n", |
@@ -1277,7 +1277,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) | |||
1277 | if (time_before(complain, jiffies)) { | 1277 | if (time_before(complain, jiffies)) { |
1278 | dprintk("NFS: faulty NFS server %s:" | 1278 | dprintk("NFS: faulty NFS server %s:" |
1279 | " (committed = %d) != (stable = %d)\n", | 1279 | " (committed = %d) != (stable = %d)\n", |
1280 | server->nfs_client->cl_hostname, | 1280 | NFS_SERVER(data->inode)->nfs_client->cl_hostname, |
1281 | resp->verf->committed, argp->stable); | 1281 | resp->verf->committed, argp->stable); |
1282 | complain = jiffies + 300 * HZ; | 1282 | complain = jiffies + 300 * HZ; |
1283 | } | 1283 | } |
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index ad88f1c0a4c..9c51aff02ae 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c | |||
@@ -36,6 +36,7 @@ | |||
36 | 36 | ||
37 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
38 | #include <linux/nfs_fs.h> | 38 | #include <linux/nfs_fs.h> |
39 | #include <linux/export.h> | ||
39 | #include "acl.h" | 40 | #include "acl.h" |
40 | 41 | ||
41 | 42 | ||
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index db34a585e11..c45a2ea4a09 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/sunrpc/clnt.h> | 13 | #include <linux/sunrpc/clnt.h> |
14 | #include <linux/sunrpc/gss_api.h> | 14 | #include <linux/sunrpc/gss_api.h> |
15 | #include <linux/sunrpc/gss_krb5_enctypes.h> | 15 | #include <linux/sunrpc/gss_krb5_enctypes.h> |
16 | #include <linux/module.h> | ||
16 | 17 | ||
17 | #include "idmap.h" | 18 | #include "idmap.h" |
18 | #include "nfsd.h" | 19 | #include "nfsd.h" |
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dc5a1bf476b..eda7d7e55e0 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c | |||
@@ -8,6 +8,7 @@ | |||
8 | 8 | ||
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/freezer.h> | 10 | #include <linux/freezer.h> |
11 | #include <linux/module.h> | ||
11 | #include <linux/fs_struct.h> | 12 | #include <linux/fs_struct.h> |
12 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
13 | 14 | ||
@@ -256,6 +257,8 @@ static void nfsd_last_thread(struct svc_serv *serv) | |||
256 | nfsd_serv = NULL; | 257 | nfsd_serv = NULL; |
257 | nfsd_shutdown(); | 258 | nfsd_shutdown(); |
258 | 259 | ||
260 | svc_rpcb_cleanup(serv); | ||
261 | |||
259 | printk(KERN_WARNING "nfsd: last server has exited, flushing export " | 262 | printk(KERN_WARNING "nfsd: last server has exited, flushing export " |
260 | "cache\n"); | 263 | "cache\n"); |
261 | nfsd_export_flush(); | 264 | nfsd_export_flush(); |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ed553c60de8..3165aebb43c 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode, | |||
5699 | OCFS2_JOURNAL_ACCESS_WRITE); | 5699 | OCFS2_JOURNAL_ACCESS_WRITE); |
5700 | if (ret) { | 5700 | if (ret) { |
5701 | mlog_errno(ret); | 5701 | mlog_errno(ret); |
5702 | goto out; | 5702 | goto out_commit; |
5703 | } | 5703 | } |
5704 | 5704 | ||
5705 | dquot_free_space_nodirty(inode, | 5705 | dquot_free_space_nodirty(inode, |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c1efe939c77..78b68af3b0e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page) | |||
290 | } | 290 | } |
291 | 291 | ||
292 | if (down_read_trylock(&oi->ip_alloc_sem) == 0) { | 292 | if (down_read_trylock(&oi->ip_alloc_sem) == 0) { |
293 | /* | ||
294 | * Unlock the page and cycle ip_alloc_sem so that we don't | ||
295 | * busyloop waiting for ip_alloc_sem to unlock | ||
296 | */ | ||
293 | ret = AOP_TRUNCATED_PAGE; | 297 | ret = AOP_TRUNCATED_PAGE; |
298 | unlock_page(page); | ||
299 | unlock = 0; | ||
300 | down_read(&oi->ip_alloc_sem); | ||
301 | up_read(&oi->ip_alloc_sem); | ||
294 | goto out_inode_unlock; | 302 | goto out_inode_unlock; |
295 | } | 303 | } |
296 | 304 | ||
@@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
563 | { | 571 | { |
564 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 572 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
565 | int level; | 573 | int level; |
574 | wait_queue_head_t *wq = ocfs2_ioend_wq(inode); | ||
566 | 575 | ||
567 | /* this io's submitter should not have unlocked this before we could */ | 576 | /* this io's submitter should not have unlocked this before we could */ |
568 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 577 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
@@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
570 | if (ocfs2_iocb_is_sem_locked(iocb)) | 579 | if (ocfs2_iocb_is_sem_locked(iocb)) |
571 | ocfs2_iocb_clear_sem_locked(iocb); | 580 | ocfs2_iocb_clear_sem_locked(iocb); |
572 | 581 | ||
582 | if (ocfs2_iocb_is_unaligned_aio(iocb)) { | ||
583 | ocfs2_iocb_clear_unaligned_aio(iocb); | ||
584 | |||
585 | if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) && | ||
586 | waitqueue_active(wq)) { | ||
587 | wake_up_all(wq); | ||
588 | } | ||
589 | } | ||
590 | |||
573 | ocfs2_iocb_clear_rw_locked(iocb); | 591 | ocfs2_iocb_clear_rw_locked(iocb); |
574 | 592 | ||
575 | level = ocfs2_iocb_rw_locked_level(iocb); | 593 | level = ocfs2_iocb_rw_locked_level(iocb); |
@@ -863,6 +881,12 @@ struct ocfs2_write_ctxt { | |||
863 | struct page *w_target_page; | 881 | struct page *w_target_page; |
864 | 882 | ||
865 | /* | 883 | /* |
884 | * w_target_locked is used in the page_mkwrite() path to indicate that | ||
885 | * w_target_page must not be unlocked by ocfs2_write_end_nolock(). | ||
886 | */ | ||
887 | unsigned int w_target_locked:1; | ||
888 | |||
889 | /* | ||
866 | * ocfs2_write_end() uses this to know what the real range to | 890 | * ocfs2_write_end() uses this to know what the real range to |
867 | * write in the target should be. | 891 | * write in the target should be. |
868 | */ | 892 | */ |
@@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) | |||
895 | 919 | ||
896 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) | 920 | static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) |
897 | { | 921 | { |
922 | int i; | ||
923 | |||
924 | /* | ||
925 | * w_target_locked is only set to true in the page_mkwrite() case. | ||
926 | * The intent is to allow us to lock the target page from write_begin() | ||
927 | * to write_end(). The caller must hold a ref on w_target_page. | ||
928 | */ | ||
929 | if (wc->w_target_locked) { | ||
930 | BUG_ON(!wc->w_target_page); | ||
931 | for (i = 0; i < wc->w_num_pages; i++) { | ||
932 | if (wc->w_target_page == wc->w_pages[i]) { | ||
933 | wc->w_pages[i] = NULL; | ||
934 | break; | ||
935 | } | ||
936 | } | ||
937 | mark_page_accessed(wc->w_target_page); | ||
938 | page_cache_release(wc->w_target_page); | ||
939 | } | ||
898 | ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); | 940 | ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); |
899 | 941 | ||
900 | brelse(wc->w_di_bh); | 942 | brelse(wc->w_di_bh); |
@@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |||
1132 | */ | 1174 | */ |
1133 | lock_page(mmap_page); | 1175 | lock_page(mmap_page); |
1134 | 1176 | ||
1177 | /* Exit and let the caller retry */ | ||
1135 | if (mmap_page->mapping != mapping) { | 1178 | if (mmap_page->mapping != mapping) { |
1179 | WARN_ON(mmap_page->mapping); | ||
1136 | unlock_page(mmap_page); | 1180 | unlock_page(mmap_page); |
1137 | /* | 1181 | ret = -EAGAIN; |
1138 | * Sanity check - the locking in | ||
1139 | * ocfs2_pagemkwrite() should ensure | ||
1140 | * that this code doesn't trigger. | ||
1141 | */ | ||
1142 | ret = -EINVAL; | ||
1143 | mlog_errno(ret); | ||
1144 | goto out; | 1182 | goto out; |
1145 | } | 1183 | } |
1146 | 1184 | ||
1147 | page_cache_get(mmap_page); | 1185 | page_cache_get(mmap_page); |
1148 | wc->w_pages[i] = mmap_page; | 1186 | wc->w_pages[i] = mmap_page; |
1187 | wc->w_target_locked = true; | ||
1149 | } else { | 1188 | } else { |
1150 | wc->w_pages[i] = find_or_create_page(mapping, index, | 1189 | wc->w_pages[i] = find_or_create_page(mapping, index, |
1151 | GFP_NOFS); | 1190 | GFP_NOFS); |
@@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |||
1160 | wc->w_target_page = wc->w_pages[i]; | 1199 | wc->w_target_page = wc->w_pages[i]; |
1161 | } | 1200 | } |
1162 | out: | 1201 | out: |
1202 | if (ret) | ||
1203 | wc->w_target_locked = false; | ||
1163 | return ret; | 1204 | return ret; |
1164 | } | 1205 | } |
1165 | 1206 | ||
@@ -1817,11 +1858,23 @@ try_again: | |||
1817 | */ | 1858 | */ |
1818 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, | 1859 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, |
1819 | cluster_of_pages, mmap_page); | 1860 | cluster_of_pages, mmap_page); |
1820 | if (ret) { | 1861 | if (ret && ret != -EAGAIN) { |
1821 | mlog_errno(ret); | 1862 | mlog_errno(ret); |
1822 | goto out_quota; | 1863 | goto out_quota; |
1823 | } | 1864 | } |
1824 | 1865 | ||
1866 | /* | ||
1867 | * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock | ||
1868 | * the target page. In this case, we exit with no error and no target | ||
1869 | * page. This will trigger the caller, page_mkwrite(), to re-try | ||
1870 | * the operation. | ||
1871 | */ | ||
1872 | if (ret == -EAGAIN) { | ||
1873 | BUG_ON(wc->w_target_page); | ||
1874 | ret = 0; | ||
1875 | goto out_quota; | ||
1876 | } | ||
1877 | |||
1825 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, | 1878 | ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, |
1826 | len); | 1879 | len); |
1827 | if (ret) { | 1880 | if (ret) { |
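Taken together, the aops.c hunks above replace the old "can't happen" -EINVAL with a real retry protocol: if the mmap target page changed mapping while we waited for its lock, ocfs2_grab_pages_for_write() returns -EAGAIN, write_begin swallows it and reports success with no target page, and the page_mkwrite caller is expected to loop. A schematic of the caller side (my_write_begin() and its return contract are illustrative, not the real ocfs2 signatures):

static int my_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	struct page *target;
	int err;

	do {
		target = NULL;
		err = my_write_begin(vma->vm_file->f_mapping, page, &target);
		/* err == 0 with target == NULL is the -EAGAIN case:
		 * the page was truncated/remapped under us, try again */
	} while (!err && !target);

	return err ? VM_FAULT_SIGBUS : VM_FAULT_LOCKED;
}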
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 75cf3ad987a..ffb2da370a9 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h | |||
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits { | |||
78 | OCFS2_IOCB_RW_LOCK = 0, | 78 | OCFS2_IOCB_RW_LOCK = 0, |
79 | OCFS2_IOCB_RW_LOCK_LEVEL, | 79 | OCFS2_IOCB_RW_LOCK_LEVEL, |
80 | OCFS2_IOCB_SEM, | 80 | OCFS2_IOCB_SEM, |
81 | OCFS2_IOCB_UNALIGNED_IO, | ||
81 | OCFS2_IOCB_NUM_LOCKS | 82 | OCFS2_IOCB_NUM_LOCKS |
82 | }; | 83 | }; |
83 | 84 | ||
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits { | |||
91 | clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) | 92 | clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) |
92 | #define ocfs2_iocb_is_sem_locked(iocb) \ | 93 | #define ocfs2_iocb_is_sem_locked(iocb) \ |
93 | test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) | 94 | test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) |
95 | |||
96 | #define ocfs2_iocb_set_unaligned_aio(iocb) \ | ||
97 | set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
98 | #define ocfs2_iocb_clear_unaligned_aio(iocb) \ | ||
99 | clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
100 | #define ocfs2_iocb_is_unaligned_aio(iocb) \ | ||
101 | test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) | ||
102 | |||
103 | #define OCFS2_IOEND_WQ_HASH_SZ 37 | ||
104 | #define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\ | ||
105 | OCFS2_IOEND_WQ_HASH_SZ]) | ||
106 | extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; | ||
107 | |||
94 | #endif /* OCFS2_FILE_H */ | 108 | #endif /* OCFS2_FILE_H */ |
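ocfs2_ioend_wq() hashes inodes onto a small fixed array of wait queues so that waiters don't need a per-inode wait_queue_head_t; hash collisions only cost spurious wakeups. Paired with the ip_unaligned_aio counter decremented in ocfs2_dio_end_io(), the waiting side would look roughly like this (a sketch; the real waiter sits in the file I/O path):

static void ocfs2_wait_for_unaligned_aio(struct inode *inode)
{
	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);

	/* sleep until the last in-flight unaligned AIO against this
	 * inode drops the count to zero */
	wait_event(*wq, !atomic_read(&OCFS2_I(inode)->ip_unaligned_aio));
}

/* the submission side pairs with the end_io decrement: */
	atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
	ocfs2_iocb_set_unaligned_aio(iocb);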
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 9a3e6bbff27..a4e855e3690 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -216,6 +216,7 @@ struct o2hb_region { | |||
216 | 216 | ||
217 | struct list_head hr_all_item; | 217 | struct list_head hr_all_item; |
218 | unsigned hr_unclean_stop:1, | 218 | unsigned hr_unclean_stop:1, |
219 | hr_aborted_start:1, | ||
219 | hr_item_pinned:1, | 220 | hr_item_pinned:1, |
220 | hr_item_dropped:1; | 221 | hr_item_dropped:1; |
221 | 222 | ||
@@ -254,6 +255,10 @@ struct o2hb_region { | |||
254 | * a more complete api that doesn't lead to this sort of fragility. */ | 255 | * a more complete api that doesn't lead to this sort of fragility. */ |
255 | atomic_t hr_steady_iterations; | 256 | atomic_t hr_steady_iterations; |
256 | 257 | ||
258 | /* terminate o2hb thread if it does not reach steady state | ||
259 | * (hr_steady_iterations == 0) within hr_unsteady_iterations */ | ||
260 | atomic_t hr_unsteady_iterations; | ||
261 | |||
257 | char hr_dev_name[BDEVNAME_SIZE]; | 262 | char hr_dev_name[BDEVNAME_SIZE]; |
258 | 263 | ||
259 | unsigned int hr_timeout_ms; | 264 | unsigned int hr_timeout_ms; |
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work) | |||
324 | 329 | ||
325 | static void o2hb_arm_write_timeout(struct o2hb_region *reg) | 330 | static void o2hb_arm_write_timeout(struct o2hb_region *reg) |
326 | { | 331 | { |
332 | /* Arm writeout only after thread reaches steady state */ | ||
333 | if (atomic_read(®->hr_steady_iterations) != 0) | ||
334 | return; | ||
335 | |||
327 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", | 336 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", |
328 | O2HB_MAX_WRITE_TIMEOUT_MS); | 337 | O2HB_MAX_WRITE_TIMEOUT_MS); |
329 | 338 | ||
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg, | |||
537 | return read == computed; | 546 | return read == computed; |
538 | } | 547 | } |
539 | 548 | ||
540 | /* We want to make sure that nobody is heartbeating on top of us -- | 549 | /* |
541 | * this will help detect an invalid configuration. */ | 550 | * Compare the slot data with what we wrote in the last iteration. |
542 | static void o2hb_check_last_timestamp(struct o2hb_region *reg) | 551 | * If the match fails, print an appropriate error message. This is to |
552 | * detect errors like... another node heartbeating on the same slot, | ||
553 | * flaky device that is losing writes, etc. | ||
554 | * Returns 1 if check succeeds, 0 otherwise. | ||
555 | */ | ||
556 | static int o2hb_check_own_slot(struct o2hb_region *reg) | ||
543 | { | 557 | { |
544 | struct o2hb_disk_slot *slot; | 558 | struct o2hb_disk_slot *slot; |
545 | struct o2hb_disk_heartbeat_block *hb_block; | 559 | struct o2hb_disk_heartbeat_block *hb_block; |
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg) | |||
548 | slot = ®->hr_slots[o2nm_this_node()]; | 562 | slot = ®->hr_slots[o2nm_this_node()]; |
549 | /* Don't check on our 1st timestamp */ | 563 | /* Don't check on our 1st timestamp */ |
550 | if (!slot->ds_last_time) | 564 | if (!slot->ds_last_time) |
551 | return; | 565 | return 0; |
552 | 566 | ||
553 | hb_block = slot->ds_raw_block; | 567 | hb_block = slot->ds_raw_block; |
554 | if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && | 568 | if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && |
555 | le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && | 569 | le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && |
556 | hb_block->hb_node == slot->ds_node_num) | 570 | hb_block->hb_node == slot->ds_node_num) |
557 | return; | 571 | return 1; |
558 | 572 | ||
559 | #define ERRSTR1 "Another node is heartbeating on device" | 573 | #define ERRSTR1 "Another node is heartbeating on device" |
560 | #define ERRSTR2 "Heartbeat generation mismatch on device" | 574 | #define ERRSTR2 "Heartbeat generation mismatch on device" |
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg) | |||
574 | (unsigned long long)slot->ds_last_time, hb_block->hb_node, | 588 | (unsigned long long)slot->ds_last_time, hb_block->hb_node, |
575 | (unsigned long long)le64_to_cpu(hb_block->hb_generation), | 589 | (unsigned long long)le64_to_cpu(hb_block->hb_generation), |
576 | (unsigned long long)le64_to_cpu(hb_block->hb_seq)); | 590 | (unsigned long long)le64_to_cpu(hb_block->hb_seq)); |
591 | |||
592 | return 0; | ||
577 | } | 593 | } |
578 | 594 | ||
579 | static inline void o2hb_prepare_block(struct o2hb_region *reg, | 595 | static inline void o2hb_prepare_block(struct o2hb_region *reg, |
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) | |||
719 | o2nm_node_put(node); | 735 | o2nm_node_put(node); |
720 | } | 736 | } |
721 | 737 | ||
722 | static void o2hb_set_quorum_device(struct o2hb_region *reg, | 738 | static void o2hb_set_quorum_device(struct o2hb_region *reg) |
723 | struct o2hb_disk_slot *slot) | ||
724 | { | 739 | { |
725 | assert_spin_locked(&o2hb_live_lock); | ||
726 | |||
727 | if (!o2hb_global_heartbeat_active()) | 740 | if (!o2hb_global_heartbeat_active()) |
728 | return; | 741 | return; |
729 | 742 | ||
730 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | 743 | /* Prevent race with o2hb_heartbeat_group_drop_item() */ |
744 | if (kthread_should_stop()) | ||
745 | return; | ||
746 | |||
747 | /* Tag region as quorum only after thread reaches steady state */ | ||
748 | if (atomic_read(®->hr_steady_iterations) != 0) | ||
731 | return; | 749 | return; |
732 | 750 | ||
751 | spin_lock(&o2hb_live_lock); | ||
752 | |||
753 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
754 | goto unlock; | ||
755 | |||
733 | /* | 756 | /* |
734 | * A region can be added to the quorum only when it sees all | 757 | * A region can be added to the quorum only when it sees all |
735 | * live nodes heartbeat on it. In other words, the region has been | 758 | * live nodes heartbeat on it. In other words, the region has been |
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, | |||
737 | */ | 760 | */ |
738 | if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, | 761 | if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, |
739 | sizeof(o2hb_live_node_bitmap))) | 762 | sizeof(o2hb_live_node_bitmap))) |
740 | return; | 763 | goto unlock; |
741 | |||
742 | if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD) | ||
743 | return; | ||
744 | 764 | ||
745 | printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", | 765 | printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", |
746 | config_item_name(®->hr_item)); | 766 | config_item_name(®->hr_item), reg->hr_dev_name); |
747 | 767 | ||
748 | set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); | 768 | set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); |
749 | 769 | ||
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg, | |||
754 | if (o2hb_pop_count(&o2hb_quorum_region_bitmap, | 774 | if (o2hb_pop_count(&o2hb_quorum_region_bitmap, |
755 | O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) | 775 | O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) |
756 | o2hb_region_unpin(NULL); | 776 | o2hb_region_unpin(NULL); |
777 | unlock: | ||
778 | spin_unlock(&o2hb_live_lock); | ||
757 | } | 779 | } |
758 | 780 | ||
759 | static int o2hb_check_slot(struct o2hb_region *reg, | 781 | static int o2hb_check_slot(struct o2hb_region *reg, |
@@ -925,8 +947,6 @@ fire_callbacks: | |||
925 | slot->ds_equal_samples = 0; | 947 | slot->ds_equal_samples = 0; |
926 | } | 948 | } |
927 | out: | 949 | out: |
928 | o2hb_set_quorum_device(reg, slot); | ||
929 | |||
930 | spin_unlock(&o2hb_live_lock); | 950 | spin_unlock(&o2hb_live_lock); |
931 | 951 | ||
932 | o2hb_run_event_list(&event); | 952 | o2hb_run_event_list(&event); |
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes, | |||
957 | 977 | ||
958 | static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | 978 | static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) |
959 | { | 979 | { |
960 | int i, ret, highest_node, change = 0; | 980 | int i, ret, highest_node; |
981 | int membership_change = 0, own_slot_ok = 0; | ||
961 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 982 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
962 | unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 983 | unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
963 | struct o2hb_bio_wait_ctxt write_wc; | 984 | struct o2hb_bio_wait_ctxt write_wc; |
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
966 | sizeof(configured_nodes)); | 987 | sizeof(configured_nodes)); |
967 | if (ret) { | 988 | if (ret) { |
968 | mlog_errno(ret); | 989 | mlog_errno(ret); |
969 | return ret; | 990 | goto bail; |
970 | } | 991 | } |
971 | 992 | ||
972 | /* | 993 | /* |
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
982 | 1003 | ||
983 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); | 1004 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); |
984 | if (highest_node >= O2NM_MAX_NODES) { | 1005 | if (highest_node >= O2NM_MAX_NODES) { |
985 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); | 1006 | mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); |
986 | return -EINVAL; | 1007 | ret = -EINVAL; |
1008 | goto bail; | ||
987 | } | 1009 | } |
988 | 1010 | ||
989 | /* No sense in reading the slots of nodes that don't exist | 1011 | /* No sense in reading the slots of nodes that don't exist |
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
993 | ret = o2hb_read_slots(reg, highest_node + 1); | 1015 | ret = o2hb_read_slots(reg, highest_node + 1); |
994 | if (ret < 0) { | 1016 | if (ret < 0) { |
995 | mlog_errno(ret); | 1017 | mlog_errno(ret); |
996 | return ret; | 1018 | goto bail; |
997 | } | 1019 | } |
998 | 1020 | ||
999 | /* With an up to date view of the slots, we can check that no | 1021 | /* With an up to date view of the slots, we can check that no |
1000 | * other node has been improperly configured to heartbeat in | 1022 | * other node has been improperly configured to heartbeat in |
1001 | * our slot. */ | 1023 | * our slot. */ |
1002 | o2hb_check_last_timestamp(reg); | 1024 | own_slot_ok = o2hb_check_own_slot(reg); |
1003 | 1025 | ||
1004 | /* fill in the proper info for our next heartbeat */ | 1026 | /* fill in the proper info for our next heartbeat */ |
1005 | o2hb_prepare_block(reg, reg->hr_generation); | 1027 | o2hb_prepare_block(reg, reg->hr_generation); |
1006 | 1028 | ||
1007 | /* And fire off the write. Note that we don't wait on this I/O | ||
1008 | * until later. */ | ||
1009 | ret = o2hb_issue_node_write(reg, &write_wc); | 1029 | ret = o2hb_issue_node_write(reg, &write_wc); |
1010 | if (ret < 0) { | 1030 | if (ret < 0) { |
1011 | mlog_errno(ret); | 1031 | mlog_errno(ret); |
1012 | return ret; | 1032 | goto bail; |
1013 | } | 1033 | } |
1014 | 1034 | ||
1015 | i = -1; | 1035 | i = -1; |
1016 | while((i = find_next_bit(configured_nodes, | 1036 | while((i = find_next_bit(configured_nodes, |
1017 | O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { | 1037 | O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { |
1018 | change |= o2hb_check_slot(reg, ®->hr_slots[i]); | 1038 | membership_change |= o2hb_check_slot(reg, ®->hr_slots[i]); |
1019 | } | 1039 | } |
1020 | 1040 | ||
1021 | /* | 1041 | /* |
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
1030 | * disk */ | 1050 | * disk */ |
1031 | mlog(ML_ERROR, "Write error %d on device \"%s\"\n", | 1051 | mlog(ML_ERROR, "Write error %d on device \"%s\"\n", |
1032 | write_wc.wc_error, reg->hr_dev_name); | 1052 | write_wc.wc_error, reg->hr_dev_name); |
1033 | return write_wc.wc_error; | 1053 | ret = write_wc.wc_error; |
1054 | goto bail; | ||
1034 | } | 1055 | } |
1035 | 1056 | ||
1036 | o2hb_arm_write_timeout(reg); | 1057 | /* Skip disarming the timeout if own slot has stale/bad data */ |
1058 | if (own_slot_ok) { | ||
1059 | o2hb_set_quorum_device(reg); | ||
1060 | o2hb_arm_write_timeout(reg); | ||
1061 | } | ||
1037 | 1062 | ||
1063 | bail: | ||
1038 | /* let the person who launched us know when things are steady */ | 1064 | /* let the person who launched us know when things are steady */ |
1039 | if (!change && (atomic_read(®->hr_steady_iterations) != 0)) { | 1065 | if (atomic_read(®->hr_steady_iterations) != 0) { |
1040 | if (atomic_dec_and_test(®->hr_steady_iterations)) | 1066 | if (!ret && own_slot_ok && !membership_change) { |
1067 | if (atomic_dec_and_test(®->hr_steady_iterations)) | ||
1068 | wake_up(&o2hb_steady_queue); | ||
1069 | } | ||
1070 | } | ||
1071 | |||
1072 | if (atomic_read(®->hr_steady_iterations) != 0) { | ||
1073 | if (atomic_dec_and_test(®->hr_unsteady_iterations)) { | ||
1074 | printk(KERN_NOTICE "o2hb: Unable to stabilize " | ||
1075 | "heartbeart on region %s (%s)\n", | ||
1076 | config_item_name(®->hr_item), | ||
1077 | reg->hr_dev_name); | ||
1078 | atomic_set(®->hr_steady_iterations, 0); | ||
1079 | reg->hr_aborted_start = 1; | ||
1041 | wake_up(&o2hb_steady_queue); | 1080 | wake_up(&o2hb_steady_queue); |
1081 | ret = -EIO; | ||
1082 | } | ||
1042 | } | 1083 | } |
1043 | 1084 | ||
1044 | return 0; | 1085 | return ret; |
1045 | } | 1086 | } |
1046 | 1087 | ||
1047 | /* Subtract b from a, storing the result in a. a *must* have a larger | 1088 | /* Subtract b from a, storing the result in a. a *must* have a larger |
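The reworked tail of o2hb_do_disk_heartbeat() above implements a bounded start-up countdown: the region graduates after hr_steady_iterations clean passes, every non-graduating pass also burns hr_unsteady_iterations budget, and exhausting that budget aborts the start. A simplified restatement of the control flow (clean_pass stands in for "!ret && own_slot_ok && !membership_change"):

	if (atomic_read(&reg->hr_steady_iterations) != 0) {
		if (clean_pass &&
		    atomic_dec_and_test(&reg->hr_steady_iterations))
			wake_up(&o2hb_steady_queue);	/* start-up done */
		else if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
			atomic_set(&reg->hr_steady_iterations, 0);
			reg->hr_aborted_start = 1;	/* budget exhausted */
			wake_up(&o2hb_steady_queue);
			ret = -EIO;
		}
	}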
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data) | |||
1095 | /* Pin node */ | 1136 | /* Pin node */ |
1096 | o2nm_depend_this_node(); | 1137 | o2nm_depend_this_node(); |
1097 | 1138 | ||
1098 | while (!kthread_should_stop() && !reg->hr_unclean_stop) { | 1139 | while (!kthread_should_stop() && |
1140 | !reg->hr_unclean_stop && !reg->hr_aborted_start) { | ||
1099 | /* We track the time spent inside | 1141 | /* We track the time spent inside |
1100 | * o2hb_do_disk_heartbeat so that we avoid more than | 1142 | * o2hb_do_disk_heartbeat so that we avoid more than |
1101 | * hr_timeout_ms between disk writes. On busy systems | 1143 | * hr_timeout_ms between disk writes. On busy systems |
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data) | |||
1103 | * likely to time itself out. */ | 1145 | * likely to time itself out. */ |
1104 | do_gettimeofday(&before_hb); | 1146 | do_gettimeofday(&before_hb); |
1105 | 1147 | ||
1106 | i = 0; | 1148 | ret = o2hb_do_disk_heartbeat(reg); |
1107 | do { | ||
1108 | ret = o2hb_do_disk_heartbeat(reg); | ||
1109 | } while (ret && ++i < 2); | ||
1110 | 1149 | ||
1111 | do_gettimeofday(&after_hb); | 1150 | do_gettimeofday(&after_hb); |
1112 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); | 1151 | elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); |
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data) | |||
1117 | after_hb.tv_sec, (unsigned long) after_hb.tv_usec, | 1156 | after_hb.tv_sec, (unsigned long) after_hb.tv_usec, |
1118 | elapsed_msec); | 1157 | elapsed_msec); |
1119 | 1158 | ||
1120 | if (elapsed_msec < reg->hr_timeout_ms) { | 1159 | if (!kthread_should_stop() && |
1160 | elapsed_msec < reg->hr_timeout_ms) { | ||
1121 | /* the kthread api has blocked signals for us so no | 1161 | /* the kthread api has blocked signals for us so no |
1122 | * need to record the return value. */ | 1162 | * need to record the return value. */ |
1123 | msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); | 1163 | msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); |
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data) | |||
1134 | * to timeout on this region when we could just as easily | 1174 | * to timeout on this region when we could just as easily |
1135 | * write a clear generation - thus indicating to them that | 1175 | * write a clear generation - thus indicating to them that |
1136 | * this node has left this region. | 1176 | * this node has left this region. |
1137 | * | 1177 | */ |
1138 | * XXX: Should we skip this on unclean_stop? */ | 1178 | if (!reg->hr_unclean_stop && !reg->hr_aborted_start) { |
1139 | o2hb_prepare_block(reg, 0); | 1179 | o2hb_prepare_block(reg, 0); |
1140 | ret = o2hb_issue_node_write(reg, &write_wc); | 1180 | ret = o2hb_issue_node_write(reg, &write_wc); |
1141 | if (ret == 0) { | 1181 | if (ret == 0) |
1142 | o2hb_wait_on_io(reg, &write_wc); | 1182 | o2hb_wait_on_io(reg, &write_wc); |
1143 | } else { | 1183 | else |
1144 | mlog_errno(ret); | 1184 | mlog_errno(ret); |
1145 | } | 1185 | } |
1146 | 1186 | ||
1147 | /* Unpin node */ | 1187 | /* Unpin node */ |
1148 | o2nm_undepend_this_node(); | 1188 | o2nm_undepend_this_node(); |
1149 | 1189 | ||
1150 | mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); | 1190 | mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n"); |
1151 | 1191 | ||
1152 | return 0; | 1192 | return 0; |
1153 | } | 1193 | } |
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) | |||
1158 | struct o2hb_debug_buf *db = inode->i_private; | 1198 | struct o2hb_debug_buf *db = inode->i_private; |
1159 | struct o2hb_region *reg; | 1199 | struct o2hb_region *reg; |
1160 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 1200 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
1201 | unsigned long lts; | ||
1161 | char *buf = NULL; | 1202 | char *buf = NULL; |
1162 | int i = -1; | 1203 | int i = -1; |
1163 | int out = 0; | 1204 | int out = 0; |
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) | |||
1194 | 1235 | ||
1195 | case O2HB_DB_TYPE_REGION_ELAPSED_TIME: | 1236 | case O2HB_DB_TYPE_REGION_ELAPSED_TIME: |
1196 | reg = (struct o2hb_region *)db->db_data; | 1237 | reg = (struct o2hb_region *)db->db_data; |
1197 | out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", | 1238 | lts = reg->hr_last_timeout_start; |
1198 | jiffies_to_msecs(jiffies - | 1239 | /* If 0, it has never been set before */ |
1199 | reg->hr_last_timeout_start)); | 1240 | if (lts) |
1241 | lts = jiffies_to_msecs(jiffies - lts); | ||
1242 | out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); | ||
1200 | goto done; | 1243 | goto done; |
1201 | 1244 | ||
1202 | case O2HB_DB_TYPE_REGION_PINNED: | 1245 | case O2HB_DB_TYPE_REGION_PINNED: |
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item) | |||
1426 | struct page *page; | 1469 | struct page *page; |
1427 | struct o2hb_region *reg = to_o2hb_region(item); | 1470 | struct o2hb_region *reg = to_o2hb_region(item); |
1428 | 1471 | ||
1472 | mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); | ||
1473 | |||
1429 | if (reg->hr_tmp_block) | 1474 | if (reg->hr_tmp_block) |
1430 | kfree(reg->hr_tmp_block); | 1475 | kfree(reg->hr_tmp_block); |
1431 | 1476 | ||
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1792 | live_threshold <<= 1; | 1837 | live_threshold <<= 1; |
1793 | spin_unlock(&o2hb_live_lock); | 1838 | spin_unlock(&o2hb_live_lock); |
1794 | } | 1839 | } |
1795 | atomic_set(®->hr_steady_iterations, live_threshold + 1); | 1840 | ++live_threshold; |
1841 | atomic_set(®->hr_steady_iterations, live_threshold); | ||
1842 | /* unsteady_iterations is double the steady_iterations */ | ||
1843 | atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); | ||
1796 | 1844 | ||
1797 | hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", | 1845 | hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", |
1798 | reg->hr_item.ci_name); | 1846 | reg->hr_item.ci_name); |
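Read together, the two counters give the region a bounded start-up budget: hr_steady_iterations is the number of heartbeats allowed for the node map to settle, and hr_unsteady_iterations, set to double that, is evidently the budget after which an unsteady start is abandoned (its consumer lies outside this hunk, so that reading is inferred from the comment). Condensed:

        ++live_threshold;       /* one extra iteration of headroom */
        atomic_set(&reg->hr_steady_iterations, live_threshold);
        /* "unsteady_iterations is double the steady_iterations" */
        atomic_set(&reg->hr_unsteady_iterations, live_threshold << 1);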
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1809 | ret = wait_event_interruptible(o2hb_steady_queue, | 1857 | ret = wait_event_interruptible(o2hb_steady_queue, |
1810 | atomic_read(®->hr_steady_iterations) == 0); | 1858 | atomic_read(®->hr_steady_iterations) == 0); |
1811 | if (ret) { | 1859 | if (ret) { |
1812 | /* We got interrupted (hello ptrace!). Clean up */ | 1860 | atomic_set(®->hr_steady_iterations, 0); |
1813 | spin_lock(&o2hb_live_lock); | 1861 | reg->hr_aborted_start = 1; |
1814 | hb_task = reg->hr_task; | 1862 | } |
1815 | reg->hr_task = NULL; | ||
1816 | spin_unlock(&o2hb_live_lock); | ||
1817 | 1863 | ||
1818 | if (hb_task) | 1864 | if (reg->hr_aborted_start) { |
1819 | kthread_stop(hb_task); | 1865 | ret = -EIO; |
1820 | goto out; | 1866 | goto out; |
1821 | } | 1867 | } |
1822 | 1868 | ||
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, | |||
1833 | ret = -EIO; | 1879 | ret = -EIO; |
1834 | 1880 | ||
1835 | if (hb_task && o2hb_global_heartbeat_active()) | 1881 | if (hb_task && o2hb_global_heartbeat_active()) |
1836 | printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", | 1882 | printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", |
1837 | config_item_name(®->hr_item)); | 1883 | config_item_name(®->hr_item), reg->hr_dev_name); |
1838 | 1884 | ||
1839 | out: | 1885 | out: |
1840 | if (filp) | 1886 | if (filp) |
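The interrupted-wait cleanup changes shape: instead of detaching and stopping the thread inline, which could race with o2hb_heartbeat_group_drop_item() doing the same, dev_write() now only records the abort and lets the common path report it. A sketch of the post-patch flow:

        ret = wait_event_interruptible(o2hb_steady_queue,
                        atomic_read(&reg->hr_steady_iterations) == 0);
        if (ret) {
                /* Interrupted: release any other waiters and flag the abort. */
                atomic_set(&reg->hr_steady_iterations, 0);
                reg->hr_aborted_start = 1;
        }

        /* The flag may equally have been set by drop_item(); either way
         * the start is reported as failed and the thread cleans up. */
        if (reg->hr_aborted_start) {
                ret = -EIO;
                goto out;
        }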
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, | |||
2092 | 2138 | ||
2093 | /* stop the thread when the user removes the region dir */ | 2139 | /* stop the thread when the user removes the region dir */ |
2094 | spin_lock(&o2hb_live_lock); | 2140 | spin_lock(&o2hb_live_lock); |
2095 | if (o2hb_global_heartbeat_active()) { | ||
2096 | clear_bit(reg->hr_region_num, o2hb_region_bitmap); | ||
2097 | clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); | ||
2098 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
2099 | quorum_region = 1; | ||
2100 | clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); | ||
2101 | } | ||
2102 | hb_task = reg->hr_task; | 2141 | hb_task = reg->hr_task; |
2103 | reg->hr_task = NULL; | 2142 | reg->hr_task = NULL; |
2104 | reg->hr_item_dropped = 1; | 2143 | reg->hr_item_dropped = 1; |
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, | |||
2107 | if (hb_task) | 2146 | if (hb_task) |
2108 | kthread_stop(hb_task); | 2147 | kthread_stop(hb_task); |
2109 | 2148 | ||
2149 | if (o2hb_global_heartbeat_active()) { | ||
2150 | spin_lock(&o2hb_live_lock); | ||
2151 | clear_bit(reg->hr_region_num, o2hb_region_bitmap); | ||
2152 | clear_bit(reg->hr_region_num, o2hb_live_region_bitmap); | ||
2153 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
2154 | quorum_region = 1; | ||
2155 | clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); | ||
2156 | spin_unlock(&o2hb_live_lock); | ||
2157 | printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", | ||
2158 | ((atomic_read(®->hr_steady_iterations) == 0) ? | ||
2159 | "stopped" : "start aborted"), config_item_name(item), | ||
2160 | reg->hr_dev_name); | ||
2161 | } | ||
2162 | |||
2110 | /* | 2163 | /* |
2111 | * If we're racing a dev_write(), we need to wake them. They will | 2164 | * If we're racing a dev_write(), we need to wake them. They will |
2112 | * check reg->hr_task | 2165 | * check reg->hr_task |
2113 | */ | 2166 | */ |
2114 | if (atomic_read(®->hr_steady_iterations) != 0) { | 2167 | if (atomic_read(®->hr_steady_iterations) != 0) { |
2168 | reg->hr_aborted_start = 1; | ||
2115 | atomic_set(®->hr_steady_iterations, 0); | 2169 | atomic_set(®->hr_steady_iterations, 0); |
2116 | wake_up(&o2hb_steady_queue); | 2170 | wake_up(&o2hb_steady_queue); |
2117 | } | 2171 | } |
2118 | 2172 | ||
2119 | if (o2hb_global_heartbeat_active()) | ||
2120 | printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n", | ||
2121 | config_item_name(®->hr_item)); | ||
2122 | |||
2123 | config_item_put(item); | 2173 | config_item_put(item); |
2124 | 2174 | ||
2125 | if (!o2hb_global_heartbeat_active() || !quorum_region) | 2175 | if (!o2hb_global_heartbeat_active() || !quorum_region) |
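The global-heartbeat bookkeeping moves from before kthread_stop() to after it, reacquiring o2hb_live_lock just for the bitmap updates; clearing the region bits only once the thread is certainly gone presumably avoids advertising the region as retired while its final heartbeat writes may still be in flight, and the single status printk can now distinguish a normal stop from an aborted start. In outline:

        spin_lock(&o2hb_live_lock);
        hb_task = reg->hr_task;         /* detach the thread first */
        reg->hr_task = NULL;
        reg->hr_item_dropped = 1;
        spin_unlock(&o2hb_live_lock);

        if (hb_task)
                kthread_stop(hb_task);  /* thread has fully exited here */

        if (o2hb_global_heartbeat_active()) {
                spin_lock(&o2hb_live_lock);     /* now retire the region */
                clear_bit(reg->hr_region_num, o2hb_region_bitmap);
                clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
                if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
                        quorum_region = 1;
                clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
                spin_unlock(&o2hb_live_lock);
        }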
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 3a5835904b3..dc45deb19e6 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #define SC_DEBUG_NAME "sock_containers" | 47 | #define SC_DEBUG_NAME "sock_containers" |
48 | #define NST_DEBUG_NAME "send_tracking" | 48 | #define NST_DEBUG_NAME "send_tracking" |
49 | #define STATS_DEBUG_NAME "stats" | 49 | #define STATS_DEBUG_NAME "stats" |
50 | #define NODES_DEBUG_NAME "connected_nodes" | ||
50 | 51 | ||
51 | #define SHOW_SOCK_CONTAINERS 0 | 52 | #define SHOW_SOCK_CONTAINERS 0 |
52 | #define SHOW_SOCK_STATS 1 | 53 | #define SHOW_SOCK_STATS 1 |
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry; | |||
55 | static struct dentry *sc_dentry; | 56 | static struct dentry *sc_dentry; |
56 | static struct dentry *nst_dentry; | 57 | static struct dentry *nst_dentry; |
57 | static struct dentry *stats_dentry; | 58 | static struct dentry *stats_dentry; |
59 | static struct dentry *nodes_dentry; | ||
58 | 60 | ||
59 | static DEFINE_SPINLOCK(o2net_debug_lock); | 61 | static DEFINE_SPINLOCK(o2net_debug_lock); |
60 | 62 | ||
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = { | |||
491 | .release = sc_fop_release, | 493 | .release = sc_fop_release, |
492 | }; | 494 | }; |
493 | 495 | ||
494 | int o2net_debugfs_init(void) | 496 | static int o2net_fill_bitmap(char *buf, int len) |
495 | { | 497 | { |
496 | o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); | 498 | unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
497 | if (!o2net_dentry) { | 499 | int i = -1, out = 0; |
498 | mlog_errno(-ENOMEM); | ||
499 | goto bail; | ||
500 | } | ||
501 | 500 | ||
502 | nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, | 501 | o2net_fill_node_map(map, sizeof(map)); |
503 | o2net_dentry, NULL, | ||
504 | &nst_seq_fops); | ||
505 | if (!nst_dentry) { | ||
506 | mlog_errno(-ENOMEM); | ||
507 | goto bail; | ||
508 | } | ||
509 | 502 | ||
510 | sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, | 503 | while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) |
511 | o2net_dentry, NULL, | 504 | out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); |
512 | &sc_seq_fops); | 505 | out += snprintf(buf + out, PAGE_SIZE - out, "\n"); |
513 | if (!sc_dentry) { | ||
514 | mlog_errno(-ENOMEM); | ||
515 | goto bail; | ||
516 | } | ||
517 | 506 | ||
518 | stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, | 507 | return out; |
519 | o2net_dentry, NULL, | 508 | } |
520 | &stats_seq_fops); | 509 | |
521 | if (!stats_dentry) { | 510 | static int nodes_fop_open(struct inode *inode, struct file *file) |
522 | mlog_errno(-ENOMEM); | 511 | { |
523 | goto bail; | 512 | char *buf; |
524 | } | 513 | |
514 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
515 | if (!buf) | ||
516 | return -ENOMEM; | ||
517 | |||
518 | i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE)); | ||
519 | |||
520 | file->private_data = buf; | ||
525 | 521 | ||
526 | return 0; | 522 | return 0; |
527 | bail: | ||
528 | debugfs_remove(stats_dentry); | ||
529 | debugfs_remove(sc_dentry); | ||
530 | debugfs_remove(nst_dentry); | ||
531 | debugfs_remove(o2net_dentry); | ||
532 | return -ENOMEM; | ||
533 | } | 523 | } |
534 | 524 | ||
525 | static int o2net_debug_release(struct inode *inode, struct file *file) | ||
526 | { | ||
527 | kfree(file->private_data); | ||
528 | return 0; | ||
529 | } | ||
530 | |||
531 | static ssize_t o2net_debug_read(struct file *file, char __user *buf, | ||
532 | size_t nbytes, loff_t *ppos) | ||
533 | { | ||
534 | return simple_read_from_buffer(buf, nbytes, ppos, file->private_data, | ||
535 | i_size_read(file->f_mapping->host)); | ||
536 | } | ||
537 | |||
538 | static const struct file_operations nodes_fops = { | ||
539 | .open = nodes_fop_open, | ||
540 | .release = o2net_debug_release, | ||
541 | .read = o2net_debug_read, | ||
542 | .llseek = generic_file_llseek, | ||
543 | }; | ||
544 | |||
535 | void o2net_debugfs_exit(void) | 545 | void o2net_debugfs_exit(void) |
536 | { | 546 | { |
547 | debugfs_remove(nodes_dentry); | ||
537 | debugfs_remove(stats_dentry); | 548 | debugfs_remove(stats_dentry); |
538 | debugfs_remove(sc_dentry); | 549 | debugfs_remove(sc_dentry); |
539 | debugfs_remove(nst_dentry); | 550 | debugfs_remove(nst_dentry); |
540 | debugfs_remove(o2net_dentry); | 551 | debugfs_remove(o2net_dentry); |
541 | } | 552 | } |
542 | 553 | ||
554 | int o2net_debugfs_init(void) | ||
555 | { | ||
556 | mode_t mode = S_IFREG|S_IRUSR; | ||
557 | |||
558 | o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); | ||
559 | if (o2net_dentry) | ||
560 | nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, | ||
561 | o2net_dentry, NULL, &nst_seq_fops); | ||
562 | if (nst_dentry) | ||
563 | sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, | ||
564 | o2net_dentry, NULL, &sc_seq_fops); | ||
565 | if (sc_dentry) | ||
566 | stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, | ||
567 | o2net_dentry, NULL, &stats_seq_fops); | ||
568 | if (stats_dentry) | ||
569 | nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, | ||
570 | o2net_dentry, NULL, &nodes_fops); | ||
571 | if (nodes_dentry) | ||
572 | return 0; | ||
573 | |||
574 | o2net_debugfs_exit(); | ||
575 | mlog_errno(-ENOMEM); | ||
576 | return -ENOMEM; | ||
577 | } | ||
578 | |||
543 | #endif /* CONFIG_DEBUG_FS */ | 579 | #endif /* CONFIG_DEBUG_FS */ |
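The new connected_nodes file follows a common one-shot debugfs pattern: render the whole report into a page at open(), record its size via i_size_write(), park the buffer in file->private_data, and serve read()/llseek() from that frozen snapshot so concurrent connection changes cannot tear the output. A self-contained userspace sketch of the same snapshot-read idea (all names here are hypothetical):

        #include <stdlib.h>
        #include <string.h>

        struct snapshot {
                char    *buf;
                size_t  len;
        };

        /* Fill once at "open" time; reads never see a half-updated view. */
        static struct snapshot *snapshot_open(size_t cap,
                                              size_t (*fill)(char *, size_t))
        {
                struct snapshot *s = malloc(sizeof(*s));
                if (!s || !(s->buf = malloc(cap))) {
                        free(s);
                        return NULL;
                }
                s->len = fill(s->buf, cap);
                return s;
        }

        /* Byte-range copy out of the snapshot; mirrors what
         * simple_read_from_buffer() does for the kernel file. */
        static size_t snapshot_read(struct snapshot *s, char *dst,
                                    size_t n, size_t *pos)
        {
                if (*pos >= s->len)
                        return 0;
                if (n > s->len - *pos)
                        n = s->len - *pos;
                memcpy(dst, s->buf + *pos, n);
                *pos += n;
                return n;
        }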
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index db5ee4b4f47..044e7b58d31 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #include <linux/idr.h> | 59 | #include <linux/idr.h> |
60 | #include <linux/kref.h> | 60 | #include <linux/kref.h> |
61 | #include <linux/net.h> | 61 | #include <linux/net.h> |
62 | #include <linux/export.h> | ||
62 | #include <net/tcp.h> | 63 | #include <net/tcp.h> |
63 | 64 | ||
64 | #include <asm/uaccess.h> | 65 | #include <asm/uaccess.h> |
@@ -545,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
545 | } | 546 | } |
546 | 547 | ||
547 | if (was_valid && !valid) { | 548 | if (was_valid && !valid) { |
548 | printk(KERN_NOTICE "o2net: no longer connected to " | 549 | printk(KERN_NOTICE "o2net: No longer connected to " |
549 | SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); | 550 | SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); |
550 | o2net_complete_nodes_nsw(nn); | 551 | o2net_complete_nodes_nsw(nn); |
551 | } | 552 | } |
@@ -555,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
555 | cancel_delayed_work(&nn->nn_connect_expired); | 556 | cancel_delayed_work(&nn->nn_connect_expired); |
556 | printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", | 557 | printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", |
557 | o2nm_this_node() > sc->sc_node->nd_num ? | 558 | o2nm_this_node() > sc->sc_node->nd_num ? |
558 | "connected to" : "accepted connection from", | 559 | "Connected to" : "Accepted connection from", |
559 | SC_NODEF_ARGS(sc)); | 560 | SC_NODEF_ARGS(sc)); |
560 | } | 561 | } |
561 | 562 | ||
@@ -643,7 +644,7 @@ static void o2net_state_change(struct sock *sk) | |||
643 | o2net_sc_queue_work(sc, &sc->sc_connect_work); | 644 | o2net_sc_queue_work(sc, &sc->sc_connect_work); |
644 | break; | 645 | break; |
645 | default: | 646 | default: |
646 | printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT | 647 | printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT |
647 | " shutdown, state %d\n", | 648 | " shutdown, state %d\n", |
648 | SC_NODEF_ARGS(sc), sk->sk_state); | 649 | SC_NODEF_ARGS(sc), sk->sk_state); |
649 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); | 650 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); |
@@ -1034,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, | |||
1034 | return ret; | 1035 | return ret; |
1035 | } | 1036 | } |
1036 | 1037 | ||
1038 | /* Get a map of all nodes to which this node is currently connected */ | ||
1039 | void o2net_fill_node_map(unsigned long *map, unsigned bytes) | ||
1040 | { | ||
1041 | struct o2net_sock_container *sc; | ||
1042 | int node, ret; | ||
1043 | |||
1044 | BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); | ||
1045 | |||
1046 | memset(map, 0, bytes); | ||
1047 | for (node = 0; node < O2NM_MAX_NODES; ++node) { | ||
1048 | o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); | ||
1049 | if (!ret) { | ||
1050 | set_bit(node, map); | ||
1051 | sc_put(sc); | ||
1052 | } | ||
1053 | } | ||
1054 | } | ||
1055 | EXPORT_SYMBOL_GPL(o2net_fill_node_map); | ||
1056 | |||
1037 | int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | 1057 | int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, |
1038 | size_t caller_veclen, u8 target_node, int *status) | 1058 | size_t caller_veclen, u8 target_node, int *status) |
1039 | { | 1059 | { |
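o2net_fill_node_map() derives "connected" from the send path itself: o2net_tx_can_proceed() reports, per node, whether a message could be sent right now, and hands back a referenced socket container that must be released with sc_put(). Reduced to its core:

        memset(map, 0, bytes);
        for (node = 0; node < O2NM_MAX_NODES; ++node) {
                o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
                if (!ret) {             /* 0: a send would proceed */
                        set_bit(node, map);
                        sc_put(sc);     /* drop the probe's reference */
                }
        }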
@@ -1284,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
1284 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | 1304 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); |
1285 | 1305 | ||
1286 | if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { | 1306 | if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { |
1287 | mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " | 1307 | printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net " |
1288 | "version %llu but %llu is required, disconnecting\n", | 1308 | "protocol version %llu but %llu is required. " |
1289 | SC_NODEF_ARGS(sc), | 1309 | "Disconnecting.\n", SC_NODEF_ARGS(sc), |
1290 | (unsigned long long)be64_to_cpu(hand->protocol_version), | 1310 | (unsigned long long)be64_to_cpu(hand->protocol_version), |
1291 | O2NET_PROTOCOL_VERSION); | 1311 | O2NET_PROTOCOL_VERSION); |
1292 | 1312 | ||
1293 | /* don't bother reconnecting if it's the wrong version. */ | 1313 | /* don't bother reconnecting if it's the wrong version. */ |
1294 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1314 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
@@ -1302,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
1302 | */ | 1322 | */ |
1303 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != | 1323 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != |
1304 | o2net_idle_timeout()) { | 1324 | o2net_idle_timeout()) { |
1305 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " | 1325 | printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network " |
1306 | "%u ms, but we use %u ms locally. disconnecting\n", | 1326 | "idle timeout of %u ms, but we use %u ms locally. " |
1307 | SC_NODEF_ARGS(sc), | 1327 | "Disconnecting.\n", SC_NODEF_ARGS(sc), |
1308 | be32_to_cpu(hand->o2net_idle_timeout_ms), | 1328 | be32_to_cpu(hand->o2net_idle_timeout_ms), |
1309 | o2net_idle_timeout()); | 1329 | o2net_idle_timeout()); |
1310 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1330 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1311 | return -1; | 1331 | return -1; |
1312 | } | 1332 | } |
1313 | 1333 | ||
1314 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != | 1334 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != |
1315 | o2net_keepalive_delay()) { | 1335 | o2net_keepalive_delay()) { |
1316 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " | 1336 | printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive " |
1317 | "%u ms, but we use %u ms locally. disconnecting\n", | 1337 | "delay of %u ms, but we use %u ms locally. " |
1318 | SC_NODEF_ARGS(sc), | 1338 | "Disconnecting.\n", SC_NODEF_ARGS(sc), |
1319 | be32_to_cpu(hand->o2net_keepalive_delay_ms), | 1339 | be32_to_cpu(hand->o2net_keepalive_delay_ms), |
1320 | o2net_keepalive_delay()); | 1340 | o2net_keepalive_delay()); |
1321 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1341 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1322 | return -1; | 1342 | return -1; |
1323 | } | 1343 | } |
1324 | 1344 | ||
1325 | if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != | 1345 | if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != |
1326 | O2HB_MAX_WRITE_TIMEOUT_MS) { | 1346 | O2HB_MAX_WRITE_TIMEOUT_MS) { |
1327 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " | 1347 | printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat " |
1328 | "%u ms, but we use %u ms locally. disconnecting\n", | 1348 | "timeout of %u ms, but we use %u ms locally. " |
1329 | SC_NODEF_ARGS(sc), | 1349 | "Disconnecting.\n", SC_NODEF_ARGS(sc), |
1330 | be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), | 1350 | be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), |
1331 | O2HB_MAX_WRITE_TIMEOUT_MS); | 1351 | O2HB_MAX_WRITE_TIMEOUT_MS); |
1332 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1352 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1333 | return -1; | 1353 | return -1; |
1334 | } | 1354 | } |
@@ -1539,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data) | |||
1539 | { | 1559 | { |
1540 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; | 1560 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; |
1541 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | 1561 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); |
1542 | |||
1543 | #ifdef CONFIG_DEBUG_FS | 1562 | #ifdef CONFIG_DEBUG_FS |
1544 | ktime_t now = ktime_get(); | 1563 | unsigned long msecs = ktime_to_ms(ktime_get()) - |
1564 | ktime_to_ms(sc->sc_tv_timer); | ||
1565 | #else | ||
1566 | unsigned long msecs = o2net_idle_timeout(); | ||
1545 | #endif | 1567 | #endif |
1546 | 1568 | ||
1547 | printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " | 1569 | printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " |
1548 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), | 1570 | "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), |
1549 | o2net_idle_timeout() / 1000, | 1571 | msecs / 1000, msecs % 1000); |
1550 | o2net_idle_timeout() % 1000); | ||
1551 | |||
1552 | #ifdef CONFIG_DEBUG_FS | ||
1553 | mlog(ML_NOTICE, "Here are some times that might help debug the " | ||
1554 | "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, " | ||
1555 | "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n", | ||
1556 | (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now), | ||
1557 | (long long)ktime_to_us(sc->sc_tv_data_ready), | ||
1558 | (long long)ktime_to_us(sc->sc_tv_advance_start), | ||
1559 | (long long)ktime_to_us(sc->sc_tv_advance_stop), | ||
1560 | sc->sc_msg_key, sc->sc_msg_type, | ||
1561 | (long long)ktime_to_us(sc->sc_tv_func_start), | ||
1562 | (long long)ktime_to_us(sc->sc_tv_func_stop)); | ||
1563 | #endif | ||
1564 | 1572 | ||
1565 | /* | 1573 | /* |
1566 | * Initialize the nn_timeout so that the next connection attempt | 1574 | * Initialize the nn_timeout so that the next connection attempt |
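The idle message now prints how long the link actually sat idle rather than the configured timeout, and the verbose ktime state dump is dropped. With debugfs the value is the wall-clock delta since the timer was last armed; without it, the configured timeout is the only number available, so it is reused. The computation in isolation:

#ifdef CONFIG_DEBUG_FS
        /* measured idle time: now minus the last timer (re)arm, in ms */
        unsigned long msecs = ktime_to_ms(ktime_get()) -
                              ktime_to_ms(sc->sc_tv_timer);
#else
        /* no timestamps compiled in; fall back to the configured value */
        unsigned long msecs = o2net_idle_timeout();
#endif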
@@ -1693,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work) | |||
1693 | 1701 | ||
1694 | out: | 1702 | out: |
1695 | if (ret) { | 1703 | if (ret) { |
1696 | mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " | 1704 | printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT |
1697 | "with errno %d\n", SC_NODEF_ARGS(sc), ret); | 1705 | " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); |
1698 | /* 0 err so that another will be queued and attempted | 1706 | /* 0 err so that another will be queued and attempted |
1699 | * from set_nn_state */ | 1707 | * from set_nn_state */ |
1700 | if (sc) | 1708 | if (sc) |
@@ -1717,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work) | |||
1717 | 1725 | ||
1718 | spin_lock(&nn->nn_lock); | 1726 | spin_lock(&nn->nn_lock); |
1719 | if (!nn->nn_sc_valid) { | 1727 | if (!nn->nn_sc_valid) { |
1720 | mlog(ML_ERROR, "no connection established with node %u after " | 1728 | printk(KERN_NOTICE "o2net: No connection established with " |
1721 | "%u.%u seconds, giving up and returning errors.\n", | 1729 | "node %u after %u.%u seconds, giving up.\n", |
1722 | o2net_num_from_nn(nn), | 1730 | o2net_num_from_nn(nn), |
1723 | o2net_idle_timeout() / 1000, | 1731 | o2net_idle_timeout() / 1000, |
1724 | o2net_idle_timeout() % 1000); | 1732 | o2net_idle_timeout() % 1000); |
@@ -1861,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock) | |||
1861 | 1869 | ||
1862 | node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); | 1870 | node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); |
1863 | if (node == NULL) { | 1871 | if (node == NULL) { |
1864 | mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", | 1872 | printk(KERN_NOTICE "o2net: Attempt to connect from unknown " |
1865 | &sin.sin_addr.s_addr, ntohs(sin.sin_port)); | 1873 | "node at %pI4:%d\n", &sin.sin_addr.s_addr, |
1874 | ntohs(sin.sin_port)); | ||
1866 | ret = -EINVAL; | 1875 | ret = -EINVAL; |
1867 | goto out; | 1876 | goto out; |
1868 | } | 1877 | } |
1869 | 1878 | ||
1870 | if (o2nm_this_node() >= node->nd_num) { | 1879 | if (o2nm_this_node() >= node->nd_num) { |
1871 | local_node = o2nm_get_node_by_num(o2nm_this_node()); | 1880 | local_node = o2nm_get_node_by_num(o2nm_this_node()); |
1872 | mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" | 1881 | printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " |
1873 | "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", | 1882 | "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " |
1874 | local_node->nd_name, local_node->nd_num, | 1883 | "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, |
1875 | &(local_node->nd_ipv4_address), | 1884 | &(local_node->nd_ipv4_address), |
1876 | ntohs(local_node->nd_ipv4_port), | 1885 | ntohs(local_node->nd_ipv4_port), node->nd_name, |
1877 | node->nd_name, node->nd_num, &sin.sin_addr.s_addr, | 1886 | node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); |
1878 | ntohs(sin.sin_port)); | ||
1879 | ret = -EINVAL; | 1887 | ret = -EINVAL; |
1880 | goto out; | 1888 | goto out; |
1881 | } | 1889 | } |
@@ -1900,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock) | |||
1900 | ret = 0; | 1908 | ret = 0; |
1901 | spin_unlock(&nn->nn_lock); | 1909 | spin_unlock(&nn->nn_lock); |
1902 | if (ret) { | 1910 | if (ret) { |
1903 | mlog(ML_NOTICE, "attempt to connect from node '%s' at " | 1911 | printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' " |
1904 | "%pI4:%d but it already has an open connection\n", | 1912 | "at %pI4:%d but it already has an open connection\n", |
1905 | node->nd_name, &sin.sin_addr.s_addr, | 1913 | node->nd_name, &sin.sin_addr.s_addr, |
1906 | ntohs(sin.sin_port)); | 1914 | ntohs(sin.sin_port)); |
1907 | goto out; | 1915 | goto out; |
1908 | } | 1916 | } |
1909 | 1917 | ||
@@ -1983,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) | |||
1983 | 1991 | ||
1984 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); | 1992 | ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); |
1985 | if (ret < 0) { | 1993 | if (ret < 0) { |
1986 | mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); | 1994 | printk(KERN_ERR "o2net: Error %d while creating socket\n", ret); |
1987 | goto out; | 1995 | goto out; |
1988 | } | 1996 | } |
1989 | 1997 | ||
@@ -2000,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) | |||
2000 | sock->sk->sk_reuse = 1; | 2008 | sock->sk->sk_reuse = 1; |
2001 | ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); | 2009 | ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); |
2002 | if (ret < 0) { | 2010 | if (ret < 0) { |
2003 | mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " | 2011 | printk(KERN_ERR "o2net: Error %d while binding socket at " |
2004 | "ret=%d\n", &addr, ntohs(port), ret); | 2012 | "%pI4:%u\n", ret, &addr, ntohs(port)); |
2005 | goto out; | 2013 | goto out; |
2006 | } | 2014 | } |
2007 | 2015 | ||
2008 | ret = sock->ops->listen(sock, 64); | 2016 | ret = sock->ops->listen(sock, 64); |
2009 | if (ret < 0) { | 2017 | if (ret < 0) |
2010 | mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", | 2018 | printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n", |
2011 | &addr, ntohs(port), ret); | 2019 | ret, &addr, ntohs(port)); |
2012 | } | ||
2013 | 2020 | ||
2014 | out: | 2021 | out: |
2015 | if (ret) { | 2022 | if (ret) { |
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index fd6179eb26d..5bada2a69b5 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h | |||
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, | |||
106 | struct list_head *unreg_list); | 106 | struct list_head *unreg_list); |
107 | void o2net_unregister_handler_list(struct list_head *list); | 107 | void o2net_unregister_handler_list(struct list_head *list); |
108 | 108 | ||
109 | void o2net_fill_node_map(unsigned long *map, unsigned bytes); | ||
110 | |||
109 | struct o2nm_node; | 111 | struct o2nm_node; |
110 | int o2net_register_hb_callbacks(void); | 112 | int o2net_register_hb_callbacks(void); |
111 | void o2net_unregister_hb_callbacks(void); | 113 | void o2net_unregister_hb_callbacks(void); |
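With the declaration exported here (and the EXPORT_SYMBOL_GPL in tcp.c), other cluster components can take the same snapshot the debugfs file shows. A hypothetical caller, sized to satisfy the BUG_ON in the implementation:

        unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];

        /* sizeof(map) meets the BUG_ON(bytes < BITS_TO_LONGS(O2NM_MAX_NODES) *
         * sizeof(unsigned long)) check inside o2net_fill_node_map(). */
        o2net_fill_node_map(map, sizeof(map));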
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index e2878b5895f..8fe4e2892ab 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, | |||
1184 | if (pde) | 1184 | if (pde) |
1185 | le16_add_cpu(&pde->rec_len, | 1185 | le16_add_cpu(&pde->rec_len, |
1186 | le16_to_cpu(de->rec_len)); | 1186 | le16_to_cpu(de->rec_len)); |
1187 | else | 1187 | de->inode = 0; |
1188 | de->inode = 0; | ||
1189 | dir->i_version++; | 1188 | dir->i_version++; |
1190 | ocfs2_journal_dirty(handle, bh); | 1189 | ocfs2_journal_dirty(handle, bh); |
1191 | goto bail; | 1190 | goto bail; |
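A subtle directory fix: when the deleted entry is absorbed into the previous record, de->inode was left untouched, so the dead dirent still carried a live-looking inode number inside pde's extended rec_len; clearing it unconditionally means anything that walks onto the absorbed record sees an unambiguously free slot. In effect:

        if (pde)        /* fold the freed space into the previous entry */
                le16_add_cpu(&pde->rec_len, le16_to_cpu(de->rec_len));
        de->inode = 0;  /* always mark the entry dead, merged or not */
        dir->i_version++;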
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index d602abb51b6..a5952ceecba 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); | |||
859 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm); | 859 | void dlm_wait_for_recovery(struct dlm_ctxt *dlm); |
860 | void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); | 860 | void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); |
861 | int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); | 861 | int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); |
862 | int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); | 862 | void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); |
863 | int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); | 863 | void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); |
864 | 864 | ||
865 | void dlm_put(struct dlm_ctxt *dlm); | 865 | void dlm_put(struct dlm_ctxt *dlm); |
866 | struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); | 866 | struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); |
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res) | |||
877 | kref_get(&res->refs); | 877 | kref_get(&res->refs); |
878 | } | 878 | } |
879 | void dlm_lockres_put(struct dlm_lock_resource *res); | 879 | void dlm_lockres_put(struct dlm_lock_resource *res); |
880 | void __dlm_unhash_lockres(struct dlm_lock_resource *res); | 880 | void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); |
881 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, | 881 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); |
882 | struct dlm_lock_resource *res); | ||
883 | struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, | 882 | struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, |
884 | const char *name, | 883 | const char *name, |
885 | unsigned int len, | 884 | unsigned int len, |
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | |||
902 | const char *name, | 901 | const char *name, |
903 | unsigned int namelen); | 902 | unsigned int namelen); |
904 | 903 | ||
905 | #define dlm_lockres_set_refmap_bit(bit,res) \ | 904 | void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, |
906 | __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) | 905 | struct dlm_lock_resource *res, int bit); |
907 | #define dlm_lockres_clear_refmap_bit(bit,res) \ | 906 | void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, |
908 | __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) | 907 | struct dlm_lock_resource *res, int bit); |
909 | 908 | ||
910 | static inline void __dlm_lockres_set_refmap_bit(int bit, | 909 | void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, |
911 | struct dlm_lock_resource *res, | 910 | struct dlm_lock_resource *res); |
912 | const char *file, | 911 | void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, |
913 | int line) | 912 | struct dlm_lock_resource *res); |
914 | { | ||
915 | //printk("%s:%d:%.*s: setting bit %d\n", file, line, | ||
916 | // res->lockname.len, res->lockname.name, bit); | ||
917 | set_bit(bit, res->refmap); | ||
918 | } | ||
919 | |||
920 | static inline void __dlm_lockres_clear_refmap_bit(int bit, | ||
921 | struct dlm_lock_resource *res, | ||
922 | const char *file, | ||
923 | int line) | ||
924 | { | ||
925 | //printk("%s:%d:%.*s: clearing bit %d\n", file, line, | ||
926 | // res->lockname.len, res->lockname.name, bit); | ||
927 | clear_bit(bit, res->refmap); | ||
928 | } | ||
929 | |||
930 | void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, | ||
931 | struct dlm_lock_resource *res, | ||
932 | const char *file, | ||
933 | int line); | ||
934 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | ||
935 | struct dlm_lock_resource *res, | ||
936 | int new_lockres, | ||
937 | const char *file, | ||
938 | int line); | ||
939 | #define dlm_lockres_drop_inflight_ref(d,r) \ | ||
940 | __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__) | ||
941 | #define dlm_lockres_grab_inflight_ref(d,r) \ | ||
942 | __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__) | ||
943 | #define dlm_lockres_grab_inflight_ref_new(d,r) \ | ||
944 | __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__) | ||
945 | 913 | ||
946 | void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); | 914 | void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); |
947 | void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); | 915 | void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); |
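The refmap and inflight helpers stop being __FILE__/__LINE__-threading macros and become ordinary functions (their bodies now live in dlmmaster.c, below); caller attribution in the logs survives because the functions print their return address through the %ps printk extension. The idiom in miniature (kernel context assumed):

        static void helper(void)
        {
                /* %ps resolves an address to its symbol name, so this logs
                 * e.g. "called from dlm_get_lock_resource" without any
                 * __FILE__/__LINE__ plumbing at the call sites. */
                printk(KERN_DEBUG "called from %ps\n",
                       __builtin_return_address(0));
        }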
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 56f82cb912e..0e28e242226 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/sysctl.h> | 30 | #include <linux/sysctl.h> |
31 | #include <linux/spinlock.h> | 31 | #include <linux/spinlock.h> |
32 | #include <linux/debugfs.h> | 32 | #include <linux/debugfs.h> |
33 | #include <linux/export.h> | ||
33 | 34 | ||
34 | #include "cluster/heartbeat.h" | 35 | #include "cluster/heartbeat.h" |
35 | #include "cluster/nodemanager.h" | 36 | #include "cluster/nodemanager.h" |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6ed6b95dcf9..92f2ead0fab 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing, | |||
157 | 157 | ||
158 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); | 158 | static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); |
159 | 159 | ||
160 | void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) | 160 | void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) |
161 | { | 161 | { |
162 | if (!hlist_unhashed(&lockres->hash_node)) { | 162 | if (hlist_unhashed(&res->hash_node)) |
163 | hlist_del_init(&lockres->hash_node); | 163 | return; |
164 | dlm_lockres_put(lockres); | 164 | |
165 | } | 165 | mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len, |
166 | res->lockname.name); | ||
167 | hlist_del_init(&res->hash_node); | ||
168 | dlm_lockres_put(res); | ||
166 | } | 169 | } |
167 | 170 | ||
168 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, | 171 | void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) |
169 | struct dlm_lock_resource *res) | ||
170 | { | 172 | { |
171 | struct hlist_head *bucket; | 173 | struct hlist_head *bucket; |
172 | struct qstr *q; | 174 | struct qstr *q; |
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, | |||
180 | dlm_lockres_get(res); | 182 | dlm_lockres_get(res); |
181 | 183 | ||
182 | hlist_add_head(&res->hash_node, bucket); | 184 | hlist_add_head(&res->hash_node, bucket); |
185 | |||
186 | mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len, | ||
187 | res->lockname.name); | ||
183 | } | 188 | } |
184 | 189 | ||
185 | struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, | 190 | struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, |
@@ -539,17 +544,17 @@ again: | |||
539 | 544 | ||
540 | static void __dlm_print_nodes(struct dlm_ctxt *dlm) | 545 | static void __dlm_print_nodes(struct dlm_ctxt *dlm) |
541 | { | 546 | { |
542 | int node = -1; | 547 | int node = -1, num = 0; |
543 | 548 | ||
544 | assert_spin_locked(&dlm->spinlock); | 549 | assert_spin_locked(&dlm->spinlock); |
545 | 550 | ||
546 | printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); | 551 | printk("( "); |
547 | |||
548 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, | 552 | while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, |
549 | node + 1)) < O2NM_MAX_NODES) { | 553 | node + 1)) < O2NM_MAX_NODES) { |
550 | printk("%d ", node); | 554 | printk("%d ", node); |
555 | ++num; | ||
551 | } | 556 | } |
552 | printk("\n"); | 557 | printk(") %u nodes\n", num); |
553 | } | 558 | } |
554 | 559 | ||
555 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, | 560 | static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, |
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, | |||
566 | 571 | ||
567 | node = exit_msg->node_idx; | 572 | node = exit_msg->node_idx; |
568 | 573 | ||
569 | printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name); | ||
570 | |||
571 | spin_lock(&dlm->spinlock); | 574 | spin_lock(&dlm->spinlock); |
572 | clear_bit(node, dlm->domain_map); | 575 | clear_bit(node, dlm->domain_map); |
573 | clear_bit(node, dlm->exit_domain_map); | 576 | clear_bit(node, dlm->exit_domain_map); |
577 | printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name); | ||
574 | __dlm_print_nodes(dlm); | 578 | __dlm_print_nodes(dlm); |
575 | 579 | ||
576 | /* notify anything attached to the heartbeat events */ | 580 | /* notify anything attached to the heartbeat events */ |
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) | |||
755 | 759 | ||
756 | dlm_mark_domain_leaving(dlm); | 760 | dlm_mark_domain_leaving(dlm); |
757 | dlm_leave_domain(dlm); | 761 | dlm_leave_domain(dlm); |
762 | printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name); | ||
758 | dlm_force_free_mles(dlm); | 763 | dlm_force_free_mles(dlm); |
759 | dlm_complete_dlm_shutdown(dlm); | 764 | dlm_complete_dlm_shutdown(dlm); |
760 | } | 765 | } |
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, | |||
970 | clear_bit(assert->node_idx, dlm->exit_domain_map); | 975 | clear_bit(assert->node_idx, dlm->exit_domain_map); |
971 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | 976 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); |
972 | 977 | ||
973 | printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", | 978 | printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ", |
974 | assert->node_idx, dlm->name); | 979 | assert->node_idx, dlm->name); |
975 | __dlm_print_nodes(dlm); | 980 | __dlm_print_nodes(dlm); |
976 | 981 | ||
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) | |||
1701 | bail: | 1706 | bail: |
1702 | spin_lock(&dlm->spinlock); | 1707 | spin_lock(&dlm->spinlock); |
1703 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); | 1708 | __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); |
1704 | if (!status) | 1709 | if (!status) { |
1710 | printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name); | ||
1705 | __dlm_print_nodes(dlm); | 1711 | __dlm_print_nodes(dlm); |
1712 | } | ||
1706 | spin_unlock(&dlm->spinlock); | 1713 | spin_unlock(&dlm->spinlock); |
1707 | 1714 | ||
1708 | if (ctxt) { | 1715 | if (ctxt) { |
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain, | |||
2131 | goto leave; | 2138 | goto leave; |
2132 | } | 2139 | } |
2133 | 2140 | ||
2134 | if (!o2hb_check_local_node_heartbeating()) { | ||
2135 | mlog(ML_ERROR, "the local node has not been configured, or is " | ||
2136 | "not heartbeating\n"); | ||
2137 | ret = -EPROTO; | ||
2138 | goto leave; | ||
2139 | } | ||
2140 | |||
2141 | mlog(0, "register called for domain \"%s\"\n", domain); | 2141 | mlog(0, "register called for domain \"%s\"\n", domain); |
2142 | 2142 | ||
2143 | retry: | 2143 | retry: |
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 8d39e0fd66f..975810b9849 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c | |||
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, | |||
183 | kick_thread = 1; | 183 | kick_thread = 1; |
184 | } | 184 | } |
185 | } | 185 | } |
186 | /* reduce the inflight count, this may result in the lockres | ||
187 | * being purged below during calc_usage */ | ||
188 | if (lock->ml.node == dlm->node_num) | ||
189 | dlm_lockres_drop_inflight_ref(dlm, res); | ||
190 | 186 | ||
191 | spin_unlock(&res->spinlock); | 187 | spin_unlock(&res->spinlock); |
192 | wake_up(&res->wq); | 188 | wake_up(&res->wq); |
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, | |||
231 | lock->ml.type, res->lockname.len, | 227 | lock->ml.type, res->lockname.len, |
232 | res->lockname.name, flags); | 228 | res->lockname.name, flags); |
233 | 229 | ||
230 | /* | ||
231 | * Wait if resource is getting recovered, remastered, etc. | ||
232 | * If the resource was remastered and new owner is self, then exit. | ||
233 | */ | ||
234 | spin_lock(&res->spinlock); | 234 | spin_lock(&res->spinlock); |
235 | |||
236 | /* will exit this call with spinlock held */ | ||
237 | __dlm_wait_on_lockres(res); | 235 | __dlm_wait_on_lockres(res); |
236 | if (res->owner == dlm->node_num) { | ||
237 | spin_unlock(&res->spinlock); | ||
238 | return DLM_RECOVERING; | ||
239 | } | ||
238 | res->state |= DLM_LOCK_RES_IN_PROGRESS; | 240 | res->state |= DLM_LOCK_RES_IN_PROGRESS; |
239 | 241 | ||
240 | /* add lock to local (secondary) queue */ | 242 | /* add lock to local (secondary) queue */ |
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, | |||
319 | tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, | 321 | tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, |
320 | sizeof(create), res->owner, &status); | 322 | sizeof(create), res->owner, &status); |
321 | if (tmpret >= 0) { | 323 | if (tmpret >= 0) { |
322 | // successfully sent and received | 324 | ret = status; |
323 | ret = status; // this is already a dlm_status | ||
324 | if (ret == DLM_REJECTED) { | 325 | if (ret == DLM_REJECTED) { |
325 | mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " | 326 | mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer " |
326 | "no longer owned by %u. that node is coming back " | 327 | "owned by node %u. That node is coming back up " |
327 | "up currently.\n", dlm->name, create.namelen, | 328 | "currently.\n", dlm->name, create.namelen, |
328 | create.name, res->owner); | 329 | create.name, res->owner); |
329 | dlm_print_one_lock_resource(res); | 330 | dlm_print_one_lock_resource(res); |
330 | BUG(); | 331 | BUG(); |
331 | } | 332 | } |
332 | } else { | 333 | } else { |
333 | mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " | 334 | mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to " |
334 | "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, | 335 | "node %u\n", dlm->name, create.namelen, create.name, |
335 | res->owner); | 336 | tmpret, res->owner); |
336 | if (dlm_is_host_down(tmpret)) { | 337 | if (dlm_is_host_down(tmpret)) |
337 | ret = DLM_RECOVERING; | 338 | ret = DLM_RECOVERING; |
338 | mlog(0, "node %u died so returning DLM_RECOVERING " | 339 | else |
339 | "from lock message!\n", res->owner); | ||
340 | } else { | ||
341 | ret = dlm_err_to_dlm_status(tmpret); | 340 | ret = dlm_err_to_dlm_status(tmpret); |
342 | } | ||
343 | } | 341 | } |
344 | 342 | ||
345 | return ret; | 343 | return ret; |
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, | |||
440 | /* zero memory only if kernel-allocated */ | 438 | /* zero memory only if kernel-allocated */ |
441 | lksb = kzalloc(sizeof(*lksb), GFP_NOFS); | 439 | lksb = kzalloc(sizeof(*lksb), GFP_NOFS); |
442 | if (!lksb) { | 440 | if (!lksb) { |
443 | kfree(lock); | 441 | kmem_cache_free(dlm_lock_cache, lock); |
444 | return NULL; | 442 | return NULL; |
445 | } | 443 | } |
446 | kernel_allocated = 1; | 444 | kernel_allocated = 1; |
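A one-line allocator-pairing fix: lock is carved from the dlm_lock_cache slab earlier in this function, so the error path must hand it back with kmem_cache_free(); freeing a slab-cache object with kfree() mixes allocators and is at best fragile. The shape of the fix, reconstructed around the hunk:

        lock = kmem_cache_alloc(dlm_lock_cache, GFP_NOFS);  /* slab object */
        if (!lock)
                return NULL;

        lksb = kzalloc(sizeof(*lksb), GFP_NOFS);            /* kmalloc object */
        if (!lksb) {
                kmem_cache_free(dlm_lock_cache, lock);  /* must match the cache,
                                                           not kfree() */
                return NULL;
        }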
@@ -718,18 +716,10 @@ retry_lock: | |||
718 | 716 | ||
719 | if (status == DLM_RECOVERING || status == DLM_MIGRATING || | 717 | if (status == DLM_RECOVERING || status == DLM_MIGRATING || |
720 | status == DLM_FORWARD) { | 718 | status == DLM_FORWARD) { |
721 | mlog(0, "retrying lock with migration/" | ||
722 | "recovery/in progress\n"); | ||
723 | msleep(100); | 719 | msleep(100); |
724 | /* no waiting for dlm_reco_thread */ | ||
725 | if (recovery) { | 720 | if (recovery) { |
726 | if (status != DLM_RECOVERING) | 721 | if (status != DLM_RECOVERING) |
727 | goto retry_lock; | 722 | goto retry_lock; |
728 | |||
729 | mlog(0, "%s: got RECOVERING " | ||
730 | "for $RECOVERY lock, master " | ||
731 | "was %u\n", dlm->name, | ||
732 | res->owner); | ||
733 | /* wait to see the node go down, then | 723 | /* wait to see the node go down, then |
734 | * drop down and allow the lockres to | 724 | * drop down and allow the lockres to |
735 | * get cleaned up. need to remaster. */ | 725 | * get cleaned up. need to remaster. */ |
@@ -741,6 +731,14 @@ retry_lock: | |||
741 | } | 731 | } |
742 | } | 732 | } |
743 | 733 | ||
734 | /* Inflight taken in dlm_get_lock_resource() is dropped here */ | ||
735 | spin_lock(&res->spinlock); | ||
736 | dlm_lockres_drop_inflight_ref(dlm, res); | ||
737 | spin_unlock(&res->spinlock); | ||
738 | |||
739 | dlm_lockres_calc_usage(dlm, res); | ||
740 | dlm_kick_thread(dlm, res); | ||
741 | |||
744 | if (status != DLM_NORMAL) { | 742 | if (status != DLM_NORMAL) { |
745 | lock->lksb->flags &= ~DLM_LKSB_GET_LVB; | 743 | lock->lksb->flags &= ~DLM_LKSB_GET_LVB; |
746 | if (status != DLM_NOTQUEUED) | 744 | if (status != DLM_NOTQUEUED) |
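The inflight reference that dlm_get_lock_resource() takes (see the dlmmaster.c hunks below) is now paired with exactly one drop here, after the lock attempt has fully resolved, instead of the old scattered drops inside dlmlock_master() and the nonlocal path; recalculating usage and kicking the thread right after lets an otherwise idle lockres be purged promptly. Condensed:

        /* The attempt is over, win or lose: release the pin taken in
         * dlm_get_lock_resource(). */
        spin_lock(&res->spinlock);
        dlm_lockres_drop_inflight_ref(dlm, res);
        spin_unlock(&res->spinlock);

        dlm_lockres_calc_usage(dlm, res);       /* may queue it for purge */
        dlm_kick_thread(dlm, res);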
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 11eefb8c12e..005261c333b 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -631,39 +631,54 @@ error: | |||
631 | return NULL; | 631 | return NULL; |
632 | } | 632 | } |
633 | 633 | ||
634 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | 634 | void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, |
635 | struct dlm_lock_resource *res, | 635 | struct dlm_lock_resource *res, int bit) |
636 | int new_lockres, | ||
637 | const char *file, | ||
638 | int line) | ||
639 | { | 636 | { |
640 | if (!new_lockres) | 637 | assert_spin_locked(&res->spinlock); |
641 | assert_spin_locked(&res->spinlock); | 638 | |
639 | mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, | ||
640 | res->lockname.name, bit, __builtin_return_address(0)); | ||
641 | |||
642 | set_bit(bit, res->refmap); | ||
643 | } | ||
644 | |||
645 | void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, | ||
646 | struct dlm_lock_resource *res, int bit) | ||
647 | { | ||
648 | assert_spin_locked(&res->spinlock); | ||
649 | |||
650 | mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, | ||
651 | res->lockname.name, bit, __builtin_return_address(0)); | ||
652 | |||
653 | clear_bit(bit, res->refmap); | ||
654 | } | ||
655 | |||
656 | |||
657 | void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | ||
658 | struct dlm_lock_resource *res) | ||
659 | { | ||
660 | assert_spin_locked(&res->spinlock); | ||
642 | 661 | ||
643 | if (!test_bit(dlm->node_num, res->refmap)) { | ||
644 | BUG_ON(res->inflight_locks != 0); | ||
645 | dlm_lockres_set_refmap_bit(dlm->node_num, res); | ||
646 | } | ||
647 | res->inflight_locks++; | 662 | res->inflight_locks++; |
648 | mlog(0, "%s:%.*s: inflight++: now %u\n", | 663 | |
649 | dlm->name, res->lockname.len, res->lockname.name, | 664 | mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, |
650 | res->inflight_locks); | 665 | res->lockname.len, res->lockname.name, res->inflight_locks, |
666 | __builtin_return_address(0)); | ||
651 | } | 667 | } |
652 | 668 | ||
653 | void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, | 669 | void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, |
654 | struct dlm_lock_resource *res, | 670 | struct dlm_lock_resource *res) |
655 | const char *file, | ||
656 | int line) | ||
657 | { | 671 | { |
658 | assert_spin_locked(&res->spinlock); | 672 | assert_spin_locked(&res->spinlock); |
659 | 673 | ||
660 | BUG_ON(res->inflight_locks == 0); | 674 | BUG_ON(res->inflight_locks == 0); |
675 | |||
661 | res->inflight_locks--; | 676 | res->inflight_locks--; |
662 | mlog(0, "%s:%.*s: inflight--: now %u\n", | 677 | |
663 | dlm->name, res->lockname.len, res->lockname.name, | 678 | mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, |
664 | res->inflight_locks); | 679 | res->lockname.len, res->lockname.name, res->inflight_locks, |
665 | if (res->inflight_locks == 0) | 680 | __builtin_return_address(0)); |
666 | dlm_lockres_clear_refmap_bit(dlm->node_num, res); | 681 | |
667 | wake_up(&res->wq); | 682 | wake_up(&res->wq); |
668 | } | 683 | } |
669 | 684 | ||
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, | |||
697 | unsigned int hash; | 712 | unsigned int hash; |
698 | int tries = 0; | 713 | int tries = 0; |
699 | int bit, wait_on_recovery = 0; | 714 | int bit, wait_on_recovery = 0; |
700 | int drop_inflight_if_nonlocal = 0; | ||
701 | 715 | ||
702 | BUG_ON(!lockid); | 716 | BUG_ON(!lockid); |
703 | 717 | ||
@@ -709,36 +723,33 @@ lookup: | |||
709 | spin_lock(&dlm->spinlock); | 723 | spin_lock(&dlm->spinlock); |
710 | tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); | 724 | tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); |
711 | if (tmpres) { | 725 | if (tmpres) { |
712 | int dropping_ref = 0; | ||
713 | |||
714 | spin_unlock(&dlm->spinlock); | 726 | spin_unlock(&dlm->spinlock); |
715 | |||
716 | spin_lock(&tmpres->spinlock); | 727 | spin_lock(&tmpres->spinlock); |
717 | /* We wait for the other thread that is mastering the resource */ | 728 | /* Wait on the thread that is mastering the resource */ |
718 | if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { | 729 | if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { |
719 | __dlm_wait_on_lockres(tmpres); | 730 | __dlm_wait_on_lockres(tmpres); |
720 | BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); | 731 | BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); |
732 | spin_unlock(&tmpres->spinlock); | ||
733 | dlm_lockres_put(tmpres); | ||
734 | tmpres = NULL; | ||
735 | goto lookup; | ||
721 | } | 736 | } |
722 | 737 | ||
723 | if (tmpres->owner == dlm->node_num) { | 738 | /* Wait on the resource purge to complete before continuing */ |
724 | BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); | 739 | if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { |
725 | dlm_lockres_grab_inflight_ref(dlm, tmpres); | 740 | BUG_ON(tmpres->owner == dlm->node_num); |
726 | } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) | 741 | __dlm_wait_on_lockres_flags(tmpres, |
727 | dropping_ref = 1; | 742 | DLM_LOCK_RES_DROPPING_REF); |
728 | spin_unlock(&tmpres->spinlock); | ||
729 | |||
730 | /* wait until done messaging the master, drop our ref to allow | ||
731 | * the lockres to be purged, start over. */ | ||
732 | if (dropping_ref) { | ||
733 | spin_lock(&tmpres->spinlock); | ||
734 | __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); | ||
735 | spin_unlock(&tmpres->spinlock); | 743 | spin_unlock(&tmpres->spinlock); |
736 | dlm_lockres_put(tmpres); | 744 | dlm_lockres_put(tmpres); |
737 | tmpres = NULL; | 745 | tmpres = NULL; |
738 | goto lookup; | 746 | goto lookup; |
739 | } | 747 | } |
740 | 748 | ||
741 | mlog(0, "found in hash!\n"); | 749 | /* Grab inflight ref to pin the resource */ |
750 | dlm_lockres_grab_inflight_ref(dlm, tmpres); | ||
751 | |||
752 | spin_unlock(&tmpres->spinlock); | ||
742 | if (res) | 753 | if (res) |
743 | dlm_lockres_put(res); | 754 | dlm_lockres_put(res); |
744 | res = tmpres; | 755 | res = tmpres; |
@@ -829,8 +840,8 @@ lookup: | |||
829 | * but they might own this lockres. wait on them. */ | 840 | * but they might own this lockres. wait on them. */ |
830 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); | 841 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); |
831 | if (bit < O2NM_MAX_NODES) { | 842 | if (bit < O2NM_MAX_NODES) { |
832 | mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " | 843 | mlog(0, "%s: res %.*s, At least one node (%d) " |
833 | "recover before lock mastery can begin\n", | 844 | "to recover before lock mastery can begin\n", |
834 | dlm->name, namelen, (char *)lockid, bit); | 845 | dlm->name, namelen, (char *)lockid, bit); |
835 | wait_on_recovery = 1; | 846 | wait_on_recovery = 1; |
836 | } | 847 | } |
@@ -843,12 +854,11 @@ lookup: | |||
843 | 854 | ||
844 | /* finally add the lockres to its hash bucket */ | 855 | /* finally add the lockres to its hash bucket */ |
845 | __dlm_insert_lockres(dlm, res); | 856 | __dlm_insert_lockres(dlm, res); |
846 | /* since this lockres is new it doesn't not require the spinlock */ | ||
847 | dlm_lockres_grab_inflight_ref_new(dlm, res); | ||
848 | 857 | ||
849 | /* if this node does not become the master make sure to drop | 858 | /* Grab inflight ref to pin the resource */ |
850 | * this inflight reference below */ | 859 | spin_lock(&res->spinlock); |
851 | drop_inflight_if_nonlocal = 1; | 860 | dlm_lockres_grab_inflight_ref(dlm, res); |
861 | spin_unlock(&res->spinlock); | ||
852 | 862 | ||
853 | /* get an extra ref on the mle in case this is a BLOCK | 863 | /* get an extra ref on the mle in case this is a BLOCK |
854 | * if so, the creator of the BLOCK may try to put the last | 864 | * if so, the creator of the BLOCK may try to put the last |
@@ -864,8 +874,8 @@ redo_request: | |||
864 | * dlm spinlock would be detectable by a change on the mle, | 874 | * dlm spinlock would be detectable by a change on the mle, |
865 | * so we only need to clear out the recovery map once. */ | 875 | * so we only need to clear out the recovery map once. */ |
866 | if (dlm_is_recovery_lock(lockid, namelen)) { | 876 | if (dlm_is_recovery_lock(lockid, namelen)) { |
867 | mlog(ML_NOTICE, "%s: recovery map is not empty, but " | 877 | mlog(0, "%s: Recovery map is not empty, but must " |
868 | "must master $RECOVERY lock now\n", dlm->name); | 878 | "master $RECOVERY lock now\n", dlm->name); |
869 | if (!dlm_pre_master_reco_lockres(dlm, res)) | 879 | if (!dlm_pre_master_reco_lockres(dlm, res)) |
870 | wait_on_recovery = 0; | 880 | wait_on_recovery = 0; |
871 | else { | 881 | else { |
@@ -883,8 +893,8 @@ redo_request: | |||
883 | spin_lock(&dlm->spinlock); | 893 | spin_lock(&dlm->spinlock); |
884 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); | 894 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); |
885 | if (bit < O2NM_MAX_NODES) { | 895 | if (bit < O2NM_MAX_NODES) { |
886 | mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " | 896 | mlog(0, "%s: res %.*s, At least one node (%d) " |
887 | "recover before lock mastery can begin\n", | 897 | "to recover before lock mastery can begin\n", |
888 | dlm->name, namelen, (char *)lockid, bit); | 898 | dlm->name, namelen, (char *)lockid, bit); |
889 | wait_on_recovery = 1; | 899 | wait_on_recovery = 1; |
890 | } else | 900 | } else |
@@ -913,8 +923,8 @@ redo_request: | |||
913 | * yet, keep going until it does. this is how the | 923 | * yet, keep going until it does. this is how the |
914 | * master will know that asserts are needed back to | 924 | * master will know that asserts are needed back to |
915 | * the lower nodes. */ | 925 | * the lower nodes. */ |
916 | mlog(0, "%s:%.*s: requests only up to %u but master " | 926 | mlog(0, "%s: res %.*s, Requests only up to %u but " |
917 | "is %u, keep going\n", dlm->name, namelen, | 927 | "master is %u, keep going\n", dlm->name, namelen, |
918 | lockid, nodenum, mle->master); | 928 | lockid, nodenum, mle->master); |
919 | } | 929 | } |
920 | } | 930 | } |
@@ -924,13 +934,12 @@ wait: | |||
924 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); | 934 | ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); |
925 | if (ret < 0) { | 935 | if (ret < 0) { |
926 | wait_on_recovery = 1; | 936 | wait_on_recovery = 1; |
927 | mlog(0, "%s:%.*s: node map changed, redo the " | 937 | mlog(0, "%s: res %.*s, Node map changed, redo the master " |
928 | "master request now, blocked=%d\n", | 938 | "request now, blocked=%d\n", dlm->name, res->lockname.len, |
929 | dlm->name, res->lockname.len, | ||
930 | res->lockname.name, blocked); | 939 | res->lockname.name, blocked); |
931 | if (++tries > 20) { | 940 | if (++tries > 20) { |
932 | mlog(ML_ERROR, "%s:%.*s: spinning on " | 941 | mlog(ML_ERROR, "%s: res %.*s, Spinning on " |
933 | "dlm_wait_for_lock_mastery, blocked=%d\n", | 942 | "dlm_wait_for_lock_mastery, blocked = %d\n", |
934 | dlm->name, res->lockname.len, | 943 | dlm->name, res->lockname.len, |
935 | res->lockname.name, blocked); | 944 | res->lockname.name, blocked); |
936 | dlm_print_one_lock_resource(res); | 945 | dlm_print_one_lock_resource(res); |
@@ -940,7 +949,8 @@ wait: | |||
940 | goto redo_request; | 949 | goto redo_request; |
941 | } | 950 | } |
942 | 951 | ||
943 | mlog(0, "lockres mastered by %u\n", res->owner); | 952 | mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, |
953 | res->lockname.name, res->owner); | ||
944 | /* make sure we never continue without this */ | 954 | /* make sure we never continue without this */ |
945 | BUG_ON(res->owner == O2NM_MAX_NODES); | 955 | BUG_ON(res->owner == O2NM_MAX_NODES); |
946 | 956 | ||
@@ -952,8 +962,6 @@ wait: | |||
952 | 962 | ||
953 | wake_waiters: | 963 | wake_waiters: |
954 | spin_lock(&res->spinlock); | 964 | spin_lock(&res->spinlock); |
955 | if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) | ||
956 | dlm_lockres_drop_inflight_ref(dlm, res); | ||
957 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | 965 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; |
958 | spin_unlock(&res->spinlock); | 966 | spin_unlock(&res->spinlock); |
959 | wake_up(&res->wq); | 967 | wake_up(&res->wq); |
@@ -1426,9 +1434,7 @@ way_up_top: | |||
1426 | } | 1434 | } |
1427 | 1435 | ||
1428 | if (res->owner == dlm->node_num) { | 1436 | if (res->owner == dlm->node_num) { |
1429 | mlog(0, "%s:%.*s: setting bit %u in refmap\n", | 1437 | dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); |
1430 | dlm->name, namelen, name, request->node_idx); | ||
1431 | dlm_lockres_set_refmap_bit(request->node_idx, res); | ||
1432 | spin_unlock(&res->spinlock); | 1438 | spin_unlock(&res->spinlock); |
1433 | response = DLM_MASTER_RESP_YES; | 1439 | response = DLM_MASTER_RESP_YES; |
1434 | if (mle) | 1440 | if (mle) |
@@ -1493,10 +1499,8 @@ way_up_top: | |||
1493 | * go back and clean the mles on any | 1499 | * go back and clean the mles on any |
1494 | * other nodes */ | 1500 | * other nodes */ |
1495 | dispatch_assert = 1; | 1501 | dispatch_assert = 1; |
1496 | dlm_lockres_set_refmap_bit(request->node_idx, res); | 1502 | dlm_lockres_set_refmap_bit(dlm, res, |
1497 | mlog(0, "%s:%.*s: setting bit %u in refmap\n", | 1503 | request->node_idx); |
1498 | dlm->name, namelen, name, | ||
1499 | request->node_idx); | ||
1500 | } else | 1504 | } else |
1501 | response = DLM_MASTER_RESP_NO; | 1505 | response = DLM_MASTER_RESP_NO; |
1502 | } else { | 1506 | } else { |
@@ -1702,7 +1706,7 @@ again: | |||
1702 | "lockres, set the bit in the refmap\n", | 1706 | "lockres, set the bit in the refmap\n", |
1703 | namelen, lockname, to); | 1707 | namelen, lockname, to); |
1704 | spin_lock(&res->spinlock); | 1708 | spin_lock(&res->spinlock); |
1705 | dlm_lockres_set_refmap_bit(to, res); | 1709 | dlm_lockres_set_refmap_bit(dlm, res, to); |
1706 | spin_unlock(&res->spinlock); | 1710 | spin_unlock(&res->spinlock); |
1707 | } | 1711 | } |
1708 | } | 1712 | } |
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | |||
2187 | namelen = res->lockname.len; | 2191 | namelen = res->lockname.len; |
2188 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); | 2192 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); |
2189 | 2193 | ||
2190 | mlog(0, "%s:%.*s: sending deref to %d\n", | ||
2191 | dlm->name, namelen, lockname, res->owner); | ||
2192 | memset(&deref, 0, sizeof(deref)); | 2194 | memset(&deref, 0, sizeof(deref)); |
2193 | deref.node_idx = dlm->node_num; | 2195 | deref.node_idx = dlm->node_num; |
2194 | deref.namelen = namelen; | 2196 | deref.namelen = namelen; |
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | |||
2197 | ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, | 2199 | ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, |
2198 | &deref, sizeof(deref), res->owner, &r); | 2200 | &deref, sizeof(deref), res->owner, &r); |
2199 | if (ret < 0) | 2201 | if (ret < 0) |
2200 | mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " | 2202 | mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", |
2201 | "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, | 2203 | dlm->name, namelen, lockname, ret, res->owner); |
2202 | res->owner); | ||
2203 | else if (r < 0) { | 2204 | else if (r < 0) { |
2204 | /* BAD. other node says I did not have a ref. */ | 2205 | /* BAD. other node says I did not have a ref. */ |
2205 | mlog(ML_ERROR,"while dropping ref on %s:%.*s " | 2206 | mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", |
2206 | "(master=%u) got %d.\n", dlm->name, namelen, | 2207 | dlm->name, namelen, lockname, res->owner, r); |
2207 | lockname, res->owner, r); | ||
2208 | dlm_print_one_lock_resource(res); | 2208 | dlm_print_one_lock_resource(res); |
2209 | BUG(); | 2209 | BUG(); |
2210 | } | 2210 | } |
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, | |||
2260 | else { | 2260 | else { |
2261 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); | 2261 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); |
2262 | if (test_bit(node, res->refmap)) { | 2262 | if (test_bit(node, res->refmap)) { |
2263 | dlm_lockres_clear_refmap_bit(node, res); | 2263 | dlm_lockres_clear_refmap_bit(dlm, res, node); |
2264 | cleared = 1; | 2264 | cleared = 1; |
2265 | } | 2265 | } |
2266 | } | 2266 | } |
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) | |||
2320 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); | 2320 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); |
2321 | if (test_bit(node, res->refmap)) { | 2321 | if (test_bit(node, res->refmap)) { |
2322 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); | 2322 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); |
2323 | dlm_lockres_clear_refmap_bit(node, res); | 2323 | dlm_lockres_clear_refmap_bit(dlm, res, node); |
2324 | cleared = 1; | 2324 | cleared = 1; |
2325 | } | 2325 | } |
2326 | spin_unlock(&res->spinlock); | 2326 | spin_unlock(&res->spinlock); |
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | |||
2802 | BUG_ON(!list_empty(&lock->bast_list)); | 2802 | BUG_ON(!list_empty(&lock->bast_list)); |
2803 | BUG_ON(lock->ast_pending); | 2803 | BUG_ON(lock->ast_pending); |
2804 | BUG_ON(lock->bast_pending); | 2804 | BUG_ON(lock->bast_pending); |
2805 | dlm_lockres_clear_refmap_bit(lock->ml.node, res); | 2805 | dlm_lockres_clear_refmap_bit(dlm, res, |
2806 | lock->ml.node); | ||
2806 | list_del_init(&lock->list); | 2807 | list_del_init(&lock->list); |
2807 | dlm_lock_put(lock); | 2808 | dlm_lock_put(lock); |
2808 | /* In a normal unlock, we would have added a | 2809 | /* In a normal unlock, we would have added a |
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | |||
2823 | mlog(0, "%s:%.*s: node %u had a ref to this " | 2824 | mlog(0, "%s:%.*s: node %u had a ref to this " |
2824 | "migrating lockres, clearing\n", dlm->name, | 2825 | "migrating lockres, clearing\n", dlm->name, |
2825 | res->lockname.len, res->lockname.name, bit); | 2826 | res->lockname.len, res->lockname.name, bit); |
2826 | dlm_lockres_clear_refmap_bit(bit, res); | 2827 | dlm_lockres_clear_refmap_bit(dlm, res, bit); |
2827 | } | 2828 | } |
2828 | bit++; | 2829 | bit++; |
2829 | } | 2830 | } |
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | |||
2916 | &migrate, sizeof(migrate), nodenum, | 2917 | &migrate, sizeof(migrate), nodenum, |
2917 | &status); | 2918 | &status); |
2918 | if (ret < 0) { | 2919 | if (ret < 0) { |
2919 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 2920 | mlog(ML_ERROR, "%s: res %.*s, Error %d send " |
2920 | "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, | 2921 | "MIGRATE_REQUEST to node %u\n", dlm->name, |
2921 | dlm->key, nodenum); | 2922 | migrate.namelen, migrate.name, ret, nodenum); |
2922 | if (!dlm_is_host_down(ret)) { | 2923 | if (!dlm_is_host_down(ret)) { |
2923 | mlog(ML_ERROR, "unhandled error=%d!\n", ret); | 2924 | mlog(ML_ERROR, "unhandled error=%d!\n", ret); |
2924 | BUG(); | 2925 | BUG(); |
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | |||
2937 | dlm->name, res->lockname.len, res->lockname.name, | 2938 | dlm->name, res->lockname.len, res->lockname.name, |
2938 | nodenum); | 2939 | nodenum); |
2939 | spin_lock(&res->spinlock); | 2940 | spin_lock(&res->spinlock); |
2940 | dlm_lockres_set_refmap_bit(nodenum, res); | 2941 | dlm_lockres_set_refmap_bit(dlm, res, nodenum); |
2941 | spin_unlock(&res->spinlock); | 2942 | spin_unlock(&res->spinlock); |
2942 | } | 2943 | } |
2943 | } | 2944 | } |
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
3271 | * mastery reference here since old_master will briefly have | 3272 | * mastery reference here since old_master will briefly have |
3272 | * a reference after the migration completes */ | 3273 | * a reference after the migration completes */ |
3273 | spin_lock(&res->spinlock); | 3274 | spin_lock(&res->spinlock); |
3274 | dlm_lockres_set_refmap_bit(old_master, res); | 3275 | dlm_lockres_set_refmap_bit(dlm, res, old_master); |
3275 | spin_unlock(&res->spinlock); | 3276 | spin_unlock(&res->spinlock); |
3276 | 3277 | ||
3277 | mlog(0, "now time to do a migrate request to other nodes\n"); | 3278 | mlog(0, "now time to do a migrate request to other nodes\n"); |
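The dlmmaster.c hunks above all make the same mechanical change: dlm_lockres_set_refmap_bit() and dlm_lockres_clear_refmap_bit() now take the dlm context first and the bit last, which lets the helpers do the "%s: res %.*s" logging once instead of at every call site (hence the removed mlog lines). A minimal sketch of the reworked setter, with the body assumed from the call sites rather than copied from dlmcommon.h:

static inline void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
					      struct dlm_lock_resource *res,
					      int bit)
{
	assert_spin_locked(&res->spinlock);

	/* Assumed trace; every call site in this diff holds res->spinlock. */
	mlog(0, "%s: res %.*s, set node %u in refmap\n", dlm->name,
	     res->lockname.len, res->lockname.name, bit);

	set_bit(bit, res->refmap);
}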
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 7efab6d28a2..01ebfd0bdad 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) | |||
362 | } | 362 | } |
363 | 363 | ||
364 | 364 | ||
365 | int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) | 365 | void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) |
366 | { | 366 | { |
367 | if (timeout) { | 367 | if (dlm_is_node_dead(dlm, node)) |
368 | mlog(ML_NOTICE, "%s: waiting %dms for notification of " | 368 | return; |
369 | "death of node %u\n", dlm->name, timeout, node); | 369 | |
370 | printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in " | ||
371 | "domain %s\n", node, dlm->name); | ||
372 | |||
373 | if (timeout) | ||
370 | wait_event_timeout(dlm->dlm_reco_thread_wq, | 374 | wait_event_timeout(dlm->dlm_reco_thread_wq, |
371 | dlm_is_node_dead(dlm, node), | 375 | dlm_is_node_dead(dlm, node), |
372 | msecs_to_jiffies(timeout)); | 376 | msecs_to_jiffies(timeout)); |
373 | } else { | 377 | else |
374 | mlog(ML_NOTICE, "%s: waiting indefinitely for notification " | ||
375 | "of death of node %u\n", dlm->name, node); | ||
376 | wait_event(dlm->dlm_reco_thread_wq, | 378 | wait_event(dlm->dlm_reco_thread_wq, |
377 | dlm_is_node_dead(dlm, node)); | 379 | dlm_is_node_dead(dlm, node)); |
378 | } | ||
379 | /* for now, return 0 */ | ||
380 | return 0; | ||
381 | } | 380 | } |
382 | 381 | ||
383 | int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) | 382 | void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) |
384 | { | 383 | { |
385 | if (timeout) { | 384 | if (dlm_is_node_recovered(dlm, node)) |
386 | mlog(0, "%s: waiting %dms for notification of " | 385 | return; |
387 | "recovery of node %u\n", dlm->name, timeout, node); | 386 | |
387 | printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in " | ||
388 | "domain %s\n", node, dlm->name); | ||
389 | |||
390 | if (timeout) | ||
388 | wait_event_timeout(dlm->dlm_reco_thread_wq, | 391 | wait_event_timeout(dlm->dlm_reco_thread_wq, |
389 | dlm_is_node_recovered(dlm, node), | 392 | dlm_is_node_recovered(dlm, node), |
390 | msecs_to_jiffies(timeout)); | 393 | msecs_to_jiffies(timeout)); |
391 | } else { | 394 | else |
392 | mlog(0, "%s: waiting indefinitely for notification " | ||
393 | "of recovery of node %u\n", dlm->name, node); | ||
394 | wait_event(dlm->dlm_reco_thread_wq, | 395 | wait_event(dlm->dlm_reco_thread_wq, |
395 | dlm_is_node_recovered(dlm, node)); | 396 | dlm_is_node_recovered(dlm, node)); |
396 | } | ||
397 | /* for now, return 0 */ | ||
398 | return 0; | ||
399 | } | 397 | } |
400 | 398 | ||
401 | /* callers of the top-level api calls (dlmlock/dlmunlock) should | 399 | /* callers of the top-level api calls (dlmlock/dlmunlock) should |
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm) | |||
430 | { | 428 | { |
431 | spin_lock(&dlm->spinlock); | 429 | spin_lock(&dlm->spinlock); |
432 | BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); | 430 | BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); |
431 | printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", | ||
432 | dlm->name, dlm->reco.dead_node); | ||
433 | dlm->reco.state |= DLM_RECO_STATE_ACTIVE; | 433 | dlm->reco.state |= DLM_RECO_STATE_ACTIVE; |
434 | spin_unlock(&dlm->spinlock); | 434 | spin_unlock(&dlm->spinlock); |
435 | } | 435 | } |
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm) | |||
440 | BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); | 440 | BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); |
441 | dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; | 441 | dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; |
442 | spin_unlock(&dlm->spinlock); | 442 | spin_unlock(&dlm->spinlock); |
443 | printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name); | ||
443 | wake_up(&dlm->reco.event); | 444 | wake_up(&dlm->reco.event); |
444 | } | 445 | } |
445 | 446 | ||
447 | static void dlm_print_recovery_master(struct dlm_ctxt *dlm) | ||
448 | { | ||
449 | printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the " | ||
450 | "dead node %u in domain %s\n", dlm->reco.new_master, | ||
451 | (dlm->node_num == dlm->reco.new_master ? "me" : "he"), | ||
452 | dlm->reco.dead_node, dlm->name); | ||
453 | } | ||
454 | |||
446 | static int dlm_do_recovery(struct dlm_ctxt *dlm) | 455 | static int dlm_do_recovery(struct dlm_ctxt *dlm) |
447 | { | 456 | { |
448 | int status = 0; | 457 | int status = 0; |
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
505 | } | 514 | } |
506 | mlog(0, "another node will master this recovery session.\n"); | 515 | mlog(0, "another node will master this recovery session.\n"); |
507 | } | 516 | } |
508 | mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", | 517 | |
509 | dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, | 518 | dlm_print_recovery_master(dlm); |
510 | dlm->node_num, dlm->reco.dead_node); | ||
511 | 519 | ||
512 | /* it is safe to start everything back up here | 520 | /* it is safe to start everything back up here |
513 | * because all of the dead node's lock resources | 521 | * because all of the dead node's lock resources |
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
518 | return 0; | 526 | return 0; |
519 | 527 | ||
520 | master_here: | 528 | master_here: |
521 | mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " | 529 | dlm_print_recovery_master(dlm); |
522 | "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task), | ||
523 | dlm->node_num, dlm->reco.dead_node, dlm->name); | ||
524 | 530 | ||
525 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); | 531 | status = dlm_remaster_locks(dlm, dlm->reco.dead_node); |
526 | if (status < 0) { | 532 | if (status < 0) { |
527 | /* we should never hit this anymore */ | 533 | /* we should never hit this anymore */ |
528 | mlog(ML_ERROR, "error %d remastering locks for node %u, " | 534 | mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, " |
529 | "retrying.\n", status, dlm->reco.dead_node); | 535 | "retrying.\n", dlm->name, status, dlm->reco.dead_node); |
530 | /* yield a bit to allow any final network messages | 536 | /* yield a bit to allow any final network messages |
531 | * to get handled on remaining nodes */ | 537 | * to get handled on remaining nodes */ |
532 | msleep(100); | 538 | msleep(100); |
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
567 | BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); | 573 | BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); |
568 | ndata->state = DLM_RECO_NODE_DATA_REQUESTING; | 574 | ndata->state = DLM_RECO_NODE_DATA_REQUESTING; |
569 | 575 | ||
570 | mlog(0, "requesting lock info from node %u\n", | 576 | mlog(0, "%s: Requesting lock info from node %u\n", dlm->name, |
571 | ndata->node_num); | 577 | ndata->node_num); |
572 | 578 | ||
573 | if (ndata->node_num == dlm->node_num) { | 579 | if (ndata->node_num == dlm->node_num) { |
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
640 | spin_unlock(&dlm_reco_state_lock); | 646 | spin_unlock(&dlm_reco_state_lock); |
641 | } | 647 | } |
642 | 648 | ||
643 | mlog(0, "done requesting all lock info\n"); | 649 | mlog(0, "%s: Done requesting all lock info\n", dlm->name); |
644 | 650 | ||
645 | /* nodes should be sending reco data now | 651 | /* nodes should be sending reco data now |
646 | * just need to wait */ | 652 | * just need to wait */ |
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, | |||
802 | 808 | ||
803 | /* negative status is handled by caller */ | 809 | /* negative status is handled by caller */ |
804 | if (ret < 0) | 810 | if (ret < 0) |
805 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 811 | mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u " |
806 | "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, | 812 | "to recover dead node %u\n", dlm->name, ret, |
807 | dlm->key, request_from); | 813 | request_from, dead_node); |
808 | |||
809 | // return from here, then | 814 | // return from here, then |
810 | // sleep until all received or error | 815 | // sleep until all received or error |
811 | return ret; | 816 | return ret; |
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) | |||
956 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, | 961 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, |
957 | sizeof(done_msg), send_to, &tmpret); | 962 | sizeof(done_msg), send_to, &tmpret); |
958 | if (ret < 0) { | 963 | if (ret < 0) { |
959 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 964 | mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u " |
960 | "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, | 965 | "to recover dead node %u\n", dlm->name, ret, send_to, |
961 | dlm->key, send_to); | 966 | dead_node); |
962 | if (!dlm_is_host_down(ret)) { | 967 | if (!dlm_is_host_down(ret)) { |
963 | BUG(); | 968 | BUG(); |
964 | } | 969 | } |
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, | |||
1127 | if (ret < 0) { | 1132 | if (ret < 0) { |
1128 | /* XXX: negative status is not handled. | 1133 | /* XXX: negative status is not handled. |
1129 | * this will end up killing this node. */ | 1134 | * this will end up killing this node. */ |
1130 | mlog(ML_ERROR, "Error %d when sending message %u (key " | 1135 | mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to " |
1131 | "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, | 1136 | "node %u (%s)\n", dlm->name, mres->lockname_len, |
1132 | dlm->key, send_to); | 1137 | mres->lockname, ret, send_to, |
1138 | (orig_flags & DLM_MRES_MIGRATION ? | ||
1139 | "migration" : "recovery")); | ||
1133 | } else { | 1140 | } else { |
1134 | /* might get an -ENOMEM back here */ | 1141 | /* might get an -ENOMEM back here */ |
1135 | ret = status; | 1142 | ret = status; |
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, | |||
1767 | dlm->name, mres->lockname_len, mres->lockname, | 1774 | dlm->name, mres->lockname_len, mres->lockname, |
1768 | from); | 1775 | from); |
1769 | spin_lock(&res->spinlock); | 1776 | spin_lock(&res->spinlock); |
1770 | dlm_lockres_set_refmap_bit(from, res); | 1777 | dlm_lockres_set_refmap_bit(dlm, res, from); |
1771 | spin_unlock(&res->spinlock); | 1778 | spin_unlock(&res->spinlock); |
1772 | added++; | 1779 | added++; |
1773 | break; | 1780 | break; |
@@ -1965,7 +1972,7 @@ skip_lvb: | |||
1965 | mlog(0, "%s:%.*s: added lock for node %u, " | 1972 | mlog(0, "%s:%.*s: added lock for node %u, " |
1966 | "setting refmap bit\n", dlm->name, | 1973 | "setting refmap bit\n", dlm->name, |
1967 | res->lockname.len, res->lockname.name, ml->node); | 1974 | res->lockname.len, res->lockname.name, ml->node); |
1968 | dlm_lockres_set_refmap_bit(ml->node, res); | 1975 | dlm_lockres_set_refmap_bit(dlm, res, ml->node); |
1969 | added++; | 1976 | added++; |
1970 | } | 1977 | } |
1971 | spin_unlock(&res->spinlock); | 1978 | spin_unlock(&res->spinlock); |
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | |||
2084 | 2091 | ||
2085 | list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { | 2092 | list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { |
2086 | if (res->owner == dead_node) { | 2093 | if (res->owner == dead_node) { |
2094 | mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", | ||
2095 | dlm->name, res->lockname.len, res->lockname.name, | ||
2096 | res->owner, new_master); | ||
2087 | list_del_init(&res->recovering); | 2097 | list_del_init(&res->recovering); |
2088 | spin_lock(&res->spinlock); | 2098 | spin_lock(&res->spinlock); |
2089 | /* new_master has our reference from | 2099 | /* new_master has our reference from |
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, | |||
2105 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { | 2115 | for (i = 0; i < DLM_HASH_BUCKETS; i++) { |
2106 | bucket = dlm_lockres_hash(dlm, i); | 2116 | bucket = dlm_lockres_hash(dlm, i); |
2107 | hlist_for_each_entry(res, hash_iter, bucket, hash_node) { | 2117 | hlist_for_each_entry(res, hash_iter, bucket, hash_node) { |
2108 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 2118 | if (!(res->state & DLM_LOCK_RES_RECOVERING)) |
2109 | if (res->owner == dead_node) { | 2119 | continue; |
2110 | mlog(0, "(this=%u) res %.*s owner=%u " | ||
2111 | "was not on recovering list, but " | ||
2112 | "clearing state anyway\n", | ||
2113 | dlm->node_num, res->lockname.len, | ||
2114 | res->lockname.name, new_master); | ||
2115 | } else if (res->owner == dlm->node_num) { | ||
2116 | mlog(0, "(this=%u) res %.*s owner=%u " | ||
2117 | "was not on recovering list, " | ||
2118 | "owner is THIS node, clearing\n", | ||
2119 | dlm->node_num, res->lockname.len, | ||
2120 | res->lockname.name, new_master); | ||
2121 | } else | ||
2122 | continue; | ||
2123 | 2120 | ||
2124 | if (!list_empty(&res->recovering)) { | 2121 | if (res->owner != dead_node && |
2125 | mlog(0, "%s:%.*s: lockres was " | 2122 | res->owner != dlm->node_num) |
2126 | "marked RECOVERING, owner=%u\n", | 2123 | continue; |
2127 | dlm->name, res->lockname.len, | 2124 | |
2128 | res->lockname.name, res->owner); | 2125 | if (!list_empty(&res->recovering)) { |
2129 | list_del_init(&res->recovering); | 2126 | list_del_init(&res->recovering); |
2130 | dlm_lockres_put(res); | 2127 | dlm_lockres_put(res); |
2131 | } | ||
2132 | spin_lock(&res->spinlock); | ||
2133 | /* new_master has our reference from | ||
2134 | * the lock state sent during recovery */ | ||
2135 | dlm_change_lockres_owner(dlm, res, new_master); | ||
2136 | res->state &= ~DLM_LOCK_RES_RECOVERING; | ||
2137 | if (__dlm_lockres_has_locks(res)) | ||
2138 | __dlm_dirty_lockres(dlm, res); | ||
2139 | spin_unlock(&res->spinlock); | ||
2140 | wake_up(&res->wq); | ||
2141 | } | 2128 | } |
2129 | |||
2130 | /* new_master has our reference from | ||
2131 | * the lock state sent during recovery */ | ||
2132 | mlog(0, "%s: res %.*s, Changing owner from %u to %u\n", | ||
2133 | dlm->name, res->lockname.len, res->lockname.name, | ||
2134 | res->owner, new_master); | ||
2135 | spin_lock(&res->spinlock); | ||
2136 | dlm_change_lockres_owner(dlm, res, new_master); | ||
2137 | res->state &= ~DLM_LOCK_RES_RECOVERING; | ||
2138 | if (__dlm_lockres_has_locks(res)) | ||
2139 | __dlm_dirty_lockres(dlm, res); | ||
2140 | spin_unlock(&res->spinlock); | ||
2141 | wake_up(&res->wq); | ||
2142 | } | 2142 | } |
2143 | } | 2143 | } |
2144 | } | 2144 | } |
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, | |||
2252 | res->lockname.len, res->lockname.name, freed, dead_node); | 2252 | res->lockname.len, res->lockname.name, freed, dead_node); |
2253 | __dlm_print_one_lock_resource(res); | 2253 | __dlm_print_one_lock_resource(res); |
2254 | } | 2254 | } |
2255 | dlm_lockres_clear_refmap_bit(dead_node, res); | 2255 | dlm_lockres_clear_refmap_bit(dlm, res, dead_node); |
2256 | } else if (test_bit(dead_node, res->refmap)) { | 2256 | } else if (test_bit(dead_node, res->refmap)) { |
2257 | mlog(0, "%s:%.*s: dead node %u had a ref, but had " | 2257 | mlog(0, "%s:%.*s: dead node %u had a ref, but had " |
2258 | "no locks and had not purged before dying\n", dlm->name, | 2258 | "no locks and had not purged before dying\n", dlm->name, |
2259 | res->lockname.len, res->lockname.name, dead_node); | 2259 | res->lockname.len, res->lockname.name, dead_node); |
2260 | dlm_lockres_clear_refmap_bit(dead_node, res); | 2260 | dlm_lockres_clear_refmap_bit(dlm, res, dead_node); |
2261 | } | 2261 | } |
2262 | 2262 | ||
2263 | /* do not kick thread yet */ | 2263 | /* do not kick thread yet */ |
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) | |||
2324 | dlm_revalidate_lvb(dlm, res, dead_node); | 2324 | dlm_revalidate_lvb(dlm, res, dead_node); |
2325 | if (res->owner == dead_node) { | 2325 | if (res->owner == dead_node) { |
2326 | if (res->state & DLM_LOCK_RES_DROPPING_REF) { | 2326 | if (res->state & DLM_LOCK_RES_DROPPING_REF) { |
2327 | mlog(ML_NOTICE, "Ignore %.*s for " | 2327 | mlog(ML_NOTICE, "%s: res %.*s, Skip " |
2328 | "recovery as it is being freed\n", | 2328 | "recovery as it is being freed\n", |
2329 | res->lockname.len, | 2329 | dlm->name, res->lockname.len, |
2330 | res->lockname.name); | 2330 | res->lockname.name); |
2331 | } else | 2331 | } else |
2332 | dlm_move_lockres_to_recovery_list(dlm, | 2332 | dlm_move_lockres_to_recovery_list(dlm, |
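The rewrite in dlm_finish_local_lockres_recovery() replaces a nested if/else ladder with guard clauses: each bucket entry is skipped early (not RECOVERING, or owned by an unrelated node) so the real work, the ownership switch, appears exactly once at one indentation level. The same shape on a self-contained toy walk, all names illustrative:

#include <stdio.h>

struct res { int recovering; int owner; };

static void finish_recovery(struct res *v, int n, int dead, int self, int new_master)
{
	for (int i = 0; i < n; i++) {
		struct res *r = &v[i];

		if (!r->recovering)			/* guard: not in recovery */
			continue;
		if (r->owner != dead && r->owner != self)
			continue;			/* guard: someone else's problem */

		r->owner = new_master;			/* single copy of the real work */
		r->recovering = 0;
	}
}

int main(void)
{
	struct res v[] = { {1, 3}, {0, 3}, {1, 7} };

	finish_recovery(v, 3, /*dead*/3, /*self*/1, /*new_master*/1);
	for (int i = 0; i < 3; i++)
		printf("res %d: owner %d recovering %d\n", i, v[i].owner, v[i].recovering);
	return 0;
}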
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 1d6d1d22c47..e73c833fc2a 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c | |||
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) | |||
94 | { | 94 | { |
95 | int bit; | 95 | int bit; |
96 | 96 | ||
97 | assert_spin_locked(&res->spinlock); | ||
98 | |||
97 | if (__dlm_lockres_has_locks(res)) | 99 | if (__dlm_lockres_has_locks(res)) |
98 | return 0; | 100 | return 0; |
99 | 101 | ||
102 | /* Locks are in the process of being created */ | ||
103 | if (res->inflight_locks) | ||
104 | return 0; | ||
105 | |||
100 | if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) | 106 | if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) |
101 | return 0; | 107 | return 0; |
102 | 108 | ||
103 | if (res->state & DLM_LOCK_RES_RECOVERING) | 109 | if (res->state & DLM_LOCK_RES_RECOVERING) |
104 | return 0; | 110 | return 0; |
105 | 111 | ||
112 | /* Another node has this resource with this node as the master */ | ||
106 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); | 113 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); |
107 | if (bit < O2NM_MAX_NODES) | 114 | if (bit < O2NM_MAX_NODES) |
108 | return 0; | 115 | return 0; |
109 | 116 | ||
110 | /* | ||
111 | * since the bit for dlm->node_num is not set, inflight_locks better | ||
112 | * be zero | ||
113 | */ | ||
114 | BUG_ON(res->inflight_locks != 0); | ||
115 | return 1; | 117 | return 1; |
116 | } | 118 | } |
117 | 119 | ||
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, | |||
185 | /* clear our bit from the master's refmap, ignore errors */ | 187 | /* clear our bit from the master's refmap, ignore errors */ |
186 | ret = dlm_drop_lockres_ref(dlm, res); | 188 | ret = dlm_drop_lockres_ref(dlm, res); |
187 | if (ret < 0) { | 189 | if (ret < 0) { |
188 | mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name, | ||
189 | res->lockname.len, res->lockname.name, ret); | ||
190 | if (!dlm_is_host_down(ret)) | 190 | if (!dlm_is_host_down(ret)) |
191 | BUG(); | 191 | BUG(); |
192 | } | 192 | } |
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, | |||
209 | BUG(); | 209 | BUG(); |
210 | } | 210 | } |
211 | 211 | ||
212 | __dlm_unhash_lockres(res); | 212 | __dlm_unhash_lockres(dlm, res); |
213 | 213 | ||
214 | /* lockres is not in the hash now. drop the flag and wake up | 214 | /* lockres is not in the hash now. drop the flag and wake up |
215 | * any processes waiting in dlm_get_lock_resource. */ | 215 | * any processes waiting in dlm_get_lock_resource. */ |
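__dlm_lockres_unused() now treats a nonzero inflight_locks count as an ordinary "still busy" reason instead of BUG()ing on it, and asserts the spinlock its callers must hold. Its final test reads "some remote node still has a refmap bit" as find_next_bit() returning an index below O2NM_MAX_NODES; a portable sketch of that scan, with a hand-rolled stand-in for the kernel helper:

#include <limits.h>
#include <stdio.h>

#define MAX_NODES 255
#define BPL (CHAR_BIT * (int)sizeof(unsigned long))
#define MAP_LONGS ((MAX_NODES + BPL - 1) / BPL)

/* Stand-in for find_next_bit(): first set bit >= off, or size if none. */
static int find_next_bit_(const unsigned long *map, int size, int off)
{
	for (int i = off; i < size; i++)
		if (map[i / BPL] & (1UL << (i % BPL)))
			return i;
	return size;
}

int main(void)
{
	unsigned long refmap[MAP_LONGS] = {0};
	int bit;

	refmap[0] |= 1UL << 9;	/* pretend node 9 still holds a reference */

	bit = find_next_bit_(refmap, MAX_NODES, 0);
	printf(bit < MAX_NODES ? "in use by node %d\n" : "unused (%d)\n", bit);
	return 0;
}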
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e1ed5e502ff..81a4cd22f80 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode) | |||
1692 | mlog(0, "inode %llu take PRMODE open lock\n", | 1692 | mlog(0, "inode %llu take PRMODE open lock\n", |
1693 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 1693 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
1694 | 1694 | ||
1695 | if (ocfs2_mount_local(osb)) | 1695 | if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) |
1696 | goto out; | 1696 | goto out; |
1697 | 1697 | ||
1698 | lockres = &OCFS2_I(inode)->ip_open_lockres; | 1698 | lockres = &OCFS2_I(inode)->ip_open_lockres; |
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write) | |||
1718 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1718 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1719 | write ? "EXMODE" : "PRMODE"); | 1719 | write ? "EXMODE" : "PRMODE"); |
1720 | 1720 | ||
1721 | if (ocfs2_is_hard_readonly(osb)) { | ||
1722 | if (write) | ||
1723 | status = -EROFS; | ||
1724 | goto out; | ||
1725 | } | ||
1726 | |||
1721 | if (ocfs2_mount_local(osb)) | 1727 | if (ocfs2_mount_local(osb)) |
1722 | goto out; | 1728 | goto out; |
1723 | 1729 | ||
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, | |||
2298 | if (ocfs2_is_hard_readonly(osb)) { | 2304 | if (ocfs2_is_hard_readonly(osb)) { |
2299 | if (ex) | 2305 | if (ex) |
2300 | status = -EROFS; | 2306 | status = -EROFS; |
2301 | goto bail; | 2307 | goto getbh; |
2302 | } | 2308 | } |
2303 | 2309 | ||
2304 | if (ocfs2_mount_local(osb)) | 2310 | if (ocfs2_mount_local(osb)) |
@@ -2356,7 +2362,7 @@ local: | |||
2356 | mlog_errno(status); | 2362 | mlog_errno(status); |
2357 | goto bail; | 2363 | goto bail; |
2358 | } | 2364 | } |
2359 | 2365 | getbh: | |
2360 | if (ret_bh) { | 2366 | if (ret_bh) { |
2361 | status = ocfs2_assign_bh(inode, ret_bh, local_bh); | 2367 | status = ocfs2_assign_bh(inode, ret_bh, local_bh); |
2362 | if (status < 0) { | 2368 | if (status < 0) { |
@@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex) | |||
2628 | 2634 | ||
2629 | BUG_ON(!dl); | 2635 | BUG_ON(!dl); |
2630 | 2636 | ||
2631 | if (ocfs2_is_hard_readonly(osb)) | 2637 | if (ocfs2_is_hard_readonly(osb)) { |
2632 | return -EROFS; | 2638 | if (ex) |
2639 | return -EROFS; | ||
2640 | return 0; | ||
2641 | } | ||
2633 | 2642 | ||
2634 | if (ocfs2_mount_local(osb)) | 2643 | if (ocfs2_mount_local(osb)) |
2635 | return 0; | 2644 | return 0; |
@@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex) | |||
2647 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; | 2656 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; |
2648 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); | 2657 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); |
2649 | 2658 | ||
2650 | if (!ocfs2_mount_local(osb)) | 2659 | if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) |
2651 | ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); | 2660 | ocfs2_cluster_unlock(osb, &dl->dl_lockres, level); |
2652 | } | 2661 | } |
2653 | 2662 | ||
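The dlmglue.c changes converge on one policy for hard-readonly mounts: shared (PR) open and dentry lock requests succeed without touching the cluster stack, exclusive requests fail with -EROFS, and unlock skips the cluster call symmetrically (inode locks additionally jump to getbh so the caller still gets its buffer head). The gate, reduced to a standalone predicate sketch of the policy, not the kernel code:

#include <errno.h>
#include <stdio.h>

/* 0 = lock granted locally, -EROFS = refused; mirrors the diff's policy. */
static int ro_cluster_lock(int hard_readonly, int local_mount, int exclusive)
{
	if (hard_readonly)
		return exclusive ? -EROFS : 0;	/* PR succeeds, no cluster I/O */
	if (local_mount)
		return 0;			/* single node: nothing to lock */
	return 0;				/* would take the real cluster lock */
}

int main(void)
{
	printf("ro+EX -> %d\n", ro_cluster_lock(1, 0, 1));	/* -EROFS */
	printf("ro+PR -> %d\n", ro_cluster_lock(1, 0, 0));	/* 0 */
	return 0;
}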
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 23457b491e8..2f5b92ef0e5 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -832,6 +832,102 @@ out: | |||
832 | return ret; | 832 | return ret; |
833 | } | 833 | } |
834 | 834 | ||
835 | int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin) | ||
836 | { | ||
837 | struct inode *inode = file->f_mapping->host; | ||
838 | int ret; | ||
839 | unsigned int is_last = 0, is_data = 0; | ||
840 | u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
841 | u32 cpos, cend, clen, hole_size; | ||
842 | u64 extoff, extlen; | ||
843 | struct buffer_head *di_bh = NULL; | ||
844 | struct ocfs2_extent_rec rec; | ||
845 | |||
846 | BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE); | ||
847 | |||
848 | ret = ocfs2_inode_lock(inode, &di_bh, 0); | ||
849 | if (ret) { | ||
850 | mlog_errno(ret); | ||
851 | goto out; | ||
852 | } | ||
853 | |||
854 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
855 | |||
856 | if (*offset >= inode->i_size) { | ||
857 | ret = -ENXIO; | ||
858 | goto out_unlock; | ||
859 | } | ||
860 | |||
861 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | ||
862 | if (origin == SEEK_HOLE) | ||
863 | *offset = inode->i_size; | ||
864 | goto out_unlock; | ||
865 | } | ||
866 | |||
867 | clen = 0; | ||
868 | cpos = *offset >> cs_bits; | ||
869 | cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); | ||
870 | |||
871 | while (cpos < cend && !is_last) { | ||
872 | ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, | ||
873 | &rec, &is_last); | ||
874 | if (ret) { | ||
875 | mlog_errno(ret); | ||
876 | goto out_unlock; | ||
877 | } | ||
878 | |||
879 | extoff = cpos; | ||
880 | extoff <<= cs_bits; | ||
881 | |||
882 | if (rec.e_blkno == 0ULL) { | ||
883 | clen = hole_size; | ||
884 | is_data = 0; | ||
885 | } else { | ||
886 | clen = le16_to_cpu(rec.e_leaf_clusters) - | ||
887 | (cpos - le32_to_cpu(rec.e_cpos)); | ||
888 | is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1; | ||
889 | } | ||
890 | |||
891 | if ((!is_data && origin == SEEK_HOLE) || | ||
892 | (is_data && origin == SEEK_DATA)) { | ||
893 | if (extoff > *offset) | ||
894 | *offset = extoff; | ||
895 | goto out_unlock; | ||
896 | } | ||
897 | |||
898 | if (!is_last) | ||
899 | cpos += clen; | ||
900 | } | ||
901 | |||
902 | if (origin == SEEK_HOLE) { | ||
903 | extoff = cpos; | ||
904 | extoff <<= cs_bits; | ||
905 | extlen = clen; | ||
906 | extlen <<= cs_bits; | ||
907 | |||
908 | if ((extoff + extlen) > inode->i_size) | ||
909 | extlen = inode->i_size - extoff; | ||
910 | extoff += extlen; | ||
911 | if (extoff > *offset) | ||
912 | *offset = extoff; | ||
913 | goto out_unlock; | ||
914 | } | ||
915 | |||
916 | ret = -ENXIO; | ||
917 | |||
918 | out_unlock: | ||
919 | |||
920 | brelse(di_bh); | ||
921 | |||
922 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
923 | |||
924 | ocfs2_inode_unlock(inode, 0); | ||
925 | out: | ||
926 | if (ret && ret != -ENXIO) | ||
927 | ret = -ENXIO; | ||
928 | return ret; | ||
929 | } | ||
930 | |||
835 | int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, | 931 | int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, |
836 | struct buffer_head *bhs[], int flags, | 932 | struct buffer_head *bhs[], int flags, |
837 | int (*validate)(struct super_block *sb, | 933 | int (*validate)(struct super_block *sb, |
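ocfs2_seek_data_hole_offset() walks the extent map cluster by cluster, treating unwritten extents as holes, and backs the SEEK_DATA/SEEK_HOLE cases added to ocfs2_file_llseek() further down in this diff. From user space the feature is plain lseek(2); a small extent walker that runs against any filesystem supporting these whence values:

#define _GNU_SOURCE		/* SEEK_DATA / SEEK_HOLE */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "file", O_RDONLY);
	off_t data = 0, hole;

	if (fd < 0)
		return 1;
	for (;;) {
		data = lseek(fd, data, SEEK_DATA);
		if (data == (off_t)-1)
			break;			/* ENXIO past the last data extent */
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole == (off_t)-1)
			break;
		printf("data: [%lld, %lld)\n", (long long)data, (long long)hole);
		data = hole;
	}
	if (errno != ENXIO)
		perror("lseek");
	close(fd);
	return 0;
}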
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index e79d41c2c90..67ea57d2fd5 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h | |||
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, | |||
53 | int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 53 | int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
54 | u64 map_start, u64 map_len); | 54 | u64 map_start, u64 map_len); |
55 | 55 | ||
56 | int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); | ||
57 | |||
56 | int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, | 58 | int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, |
57 | u32 *p_cluster, u32 *num_clusters, | 59 | u32 *p_cluster, u32 *num_clusters, |
58 | struct ocfs2_extent_list *el, | 60 | struct ocfs2_extent_list *el, |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index de4ea1af041..6e396683c3d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, | |||
1950 | if (ret < 0) | 1950 | if (ret < 0) |
1951 | mlog_errno(ret); | 1951 | mlog_errno(ret); |
1952 | 1952 | ||
1953 | if (file->f_flags & O_SYNC) | ||
1954 | handle->h_sync = 1; | ||
1955 | |||
1953 | ocfs2_commit_trans(osb, handle); | 1956 | ocfs2_commit_trans(osb, handle); |
1954 | 1957 | ||
1955 | out_inode_unlock: | 1958 | out_inode_unlock: |
@@ -2052,6 +2055,23 @@ out: | |||
2052 | return ret; | 2055 | return ret; |
2053 | } | 2056 | } |
2054 | 2057 | ||
2058 | static void ocfs2_aiodio_wait(struct inode *inode) | ||
2059 | { | ||
2060 | wait_queue_head_t *wq = ocfs2_ioend_wq(inode); | ||
2061 | |||
2062 | wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0)); | ||
2063 | } | ||
2064 | |||
2065 | static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) | ||
2066 | { | ||
2067 | int blockmask = inode->i_sb->s_blocksize - 1; | ||
2068 | loff_t final_size = pos + count; | ||
2069 | |||
2070 | if ((pos & blockmask) || (final_size & blockmask)) | ||
2071 | return 1; | ||
2072 | return 0; | ||
2073 | } | ||
2074 | |||
2055 | static int ocfs2_prepare_inode_for_refcount(struct inode *inode, | 2075 | static int ocfs2_prepare_inode_for_refcount(struct inode *inode, |
2056 | struct file *file, | 2076 | struct file *file, |
2057 | loff_t pos, size_t count, | 2077 | loff_t pos, size_t count, |
@@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
2230 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2250 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
2231 | int full_coherency = !(osb->s_mount_opt & | 2251 | int full_coherency = !(osb->s_mount_opt & |
2232 | OCFS2_MOUNT_COHERENCY_BUFFERED); | 2252 | OCFS2_MOUNT_COHERENCY_BUFFERED); |
2253 | int unaligned_dio = 0; | ||
2233 | 2254 | ||
2234 | trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, | 2255 | trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, |
2235 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 2256 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
@@ -2297,6 +2318,10 @@ relock: | |||
2297 | goto out; | 2318 | goto out; |
2298 | } | 2319 | } |
2299 | 2320 | ||
2321 | if (direct_io && !is_sync_kiocb(iocb)) | ||
2322 | unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, | ||
2323 | *ppos); | ||
2324 | |||
2300 | /* | 2325 | /* |
2301 | * We can't complete the direct I/O as requested, fall back to | 2326 | * We can't complete the direct I/O as requested, fall back to |
2302 | * buffered I/O. | 2327 | * buffered I/O. |
@@ -2311,6 +2336,18 @@ relock: | |||
2311 | goto relock; | 2336 | goto relock; |
2312 | } | 2337 | } |
2313 | 2338 | ||
2339 | if (unaligned_dio) { | ||
2340 | /* | ||
2341 | * Wait on previous unaligned aio to complete before | ||
2342 | * proceeding. | ||
2343 | */ | ||
2344 | ocfs2_aiodio_wait(inode); | ||
2345 | |||
2346 | /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */ | ||
2347 | atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio); | ||
2348 | ocfs2_iocb_set_unaligned_aio(iocb); | ||
2349 | } | ||
2350 | |||
2314 | /* | 2351 | /* |
2315 | * To later detect whether a journal commit for sync writes is | 2352 | * To later detect whether a journal commit for sync writes is |
2316 | * necessary, we sample i_size, and cluster count here. | 2353 | * necessary, we sample i_size, and cluster count here. |
@@ -2382,8 +2419,12 @@ out_dio: | |||
2382 | if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { | 2419 | if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { |
2383 | rw_level = -1; | 2420 | rw_level = -1; |
2384 | have_alloc_sem = 0; | 2421 | have_alloc_sem = 0; |
2422 | unaligned_dio = 0; | ||
2385 | } | 2423 | } |
2386 | 2424 | ||
2425 | if (unaligned_dio) | ||
2426 | atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio); | ||
2427 | |||
2387 | out: | 2428 | out: |
2388 | if (rw_level != -1) | 2429 | if (rw_level != -1) |
2389 | ocfs2_rw_unlock(inode, rw_level); | 2430 | ocfs2_rw_unlock(inode, rw_level); |
@@ -2591,6 +2632,57 @@ bail: | |||
2591 | return ret; | 2632 | return ret; |
2592 | } | 2633 | } |
2593 | 2634 | ||
2635 | /* Refer generic_file_llseek_unlocked() */ | ||
2636 | static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin) | ||
2637 | { | ||
2638 | struct inode *inode = file->f_mapping->host; | ||
2639 | int ret = 0; | ||
2640 | |||
2641 | mutex_lock(&inode->i_mutex); | ||
2642 | |||
2643 | switch (origin) { | ||
2644 | case SEEK_SET: | ||
2645 | break; | ||
2646 | case SEEK_END: | ||
2647 | offset += inode->i_size; | ||
2648 | break; | ||
2649 | case SEEK_CUR: | ||
2650 | if (offset == 0) { | ||
2651 | offset = file->f_pos; | ||
2652 | goto out; | ||
2653 | } | ||
2654 | offset += file->f_pos; | ||
2655 | break; | ||
2656 | case SEEK_DATA: | ||
2657 | case SEEK_HOLE: | ||
2658 | ret = ocfs2_seek_data_hole_offset(file, &offset, origin); | ||
2659 | if (ret) | ||
2660 | goto out; | ||
2661 | break; | ||
2662 | default: | ||
2663 | ret = -EINVAL; | ||
2664 | goto out; | ||
2665 | } | ||
2666 | |||
2667 | if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) | ||
2668 | ret = -EINVAL; | ||
2669 | if (!ret && offset > inode->i_sb->s_maxbytes) | ||
2670 | ret = -EINVAL; | ||
2671 | if (ret) | ||
2672 | goto out; | ||
2673 | |||
2674 | if (offset != file->f_pos) { | ||
2675 | file->f_pos = offset; | ||
2676 | file->f_version = 0; | ||
2677 | } | ||
2678 | |||
2679 | out: | ||
2680 | mutex_unlock(&inode->i_mutex); | ||
2681 | if (ret) | ||
2682 | return ret; | ||
2683 | return offset; | ||
2684 | } | ||
2685 | |||
2594 | const struct inode_operations ocfs2_file_iops = { | 2686 | const struct inode_operations ocfs2_file_iops = { |
2595 | .setattr = ocfs2_setattr, | 2687 | .setattr = ocfs2_setattr, |
2596 | .getattr = ocfs2_getattr, | 2688 | .getattr = ocfs2_getattr, |
@@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = { | |||
2615 | * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! | 2707 | * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! |
2616 | */ | 2708 | */ |
2617 | const struct file_operations ocfs2_fops = { | 2709 | const struct file_operations ocfs2_fops = { |
2618 | .llseek = generic_file_llseek, | 2710 | .llseek = ocfs2_file_llseek, |
2619 | .read = do_sync_read, | 2711 | .read = do_sync_read, |
2620 | .write = do_sync_write, | 2712 | .write = do_sync_write, |
2621 | .mmap = ocfs2_mmap, | 2713 | .mmap = ocfs2_mmap, |
@@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = { | |||
2663 | * the cluster. | 2755 | * the cluster. |
2664 | */ | 2756 | */ |
2665 | const struct file_operations ocfs2_fops_no_plocks = { | 2757 | const struct file_operations ocfs2_fops_no_plocks = { |
2666 | .llseek = generic_file_llseek, | 2758 | .llseek = ocfs2_file_llseek, |
2667 | .read = do_sync_read, | 2759 | .read = do_sync_read, |
2668 | .write = do_sync_write, | 2760 | .write = do_sync_write, |
2669 | .mmap = ocfs2_mmap, | 2761 | .mmap = ocfs2_mmap, |
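ocfs2_is_io_unaligned() is pure power-of-two arithmetic: with blockmask = blocksize - 1, (x & blockmask) != 0 says x is not a multiple of the block size, and a write counts as unaligned if either its start or its end fails the test; such direct writes are then serialized through ip_unaligned_aio. The check, standalone, with a few probes:

#include <stdio.h>

static int is_io_unaligned(long long pos, long long count, unsigned blocksize)
{
	long long mask = blocksize - 1;		/* blocksize must be a power of two */

	return ((pos & mask) || ((pos + count) & mask));
}

int main(void)
{
	printf("%d\n", is_io_unaligned(4096, 4096, 4096));	/* 0: aligned       */
	printf("%d\n", is_io_unaligned(4096,  100, 4096));	/* 1: short tail    */
	printf("%d\n", is_io_unaligned(  10, 4096, 4096));	/* 1: odd start     */
	return 0;
}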
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index a22d2c09889..17454a904d7 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode, | |||
951 | trace_ocfs2_cleanup_delete_inode( | 951 | trace_ocfs2_cleanup_delete_inode( |
952 | (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); | 952 | (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); |
953 | if (sync_data) | 953 | if (sync_data) |
954 | write_inode_now(inode, 1); | 954 | filemap_write_and_wait(inode->i_mapping); |
955 | truncate_inode_pages(&inode->i_data, 0); | 955 | truncate_inode_pages(&inode->i_data, 0); |
956 | } | 956 | } |
957 | 957 | ||
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1c508b149b3..88924a3133f 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -43,6 +43,9 @@ struct ocfs2_inode_info | |||
43 | /* protects extended attribute changes on this inode */ | 43 | /* protects extended attribute changes on this inode */ |
44 | struct rw_semaphore ip_xattr_sem; | 44 | struct rw_semaphore ip_xattr_sem; |
45 | 45 | ||
46 | /* Number of outstanding AIO's which are not page aligned */ | ||
47 | atomic_t ip_unaligned_aio; | ||
48 | |||
46 | /* These fields are protected by ip_lock */ | 49 | /* These fields are protected by ip_lock */ |
47 | spinlock_t ip_lock; | 50 | spinlock_t ip_lock; |
48 | u32 ip_open_count; | 51 | u32 ip_open_count; |
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index bc91072b721..726ff265b29 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, | |||
122 | if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & | 122 | if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) & |
123 | (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { | 123 | (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) { |
124 | if (!capable(CAP_LINUX_IMMUTABLE)) | 124 | if (!capable(CAP_LINUX_IMMUTABLE)) |
125 | goto bail_unlock; | 125 | goto bail_commit; |
126 | } | 126 | } |
127 | 127 | ||
128 | ocfs2_inode->ip_attr = flags; | 128 | ocfs2_inode->ip_attr = flags; |
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, | |||
132 | if (status < 0) | 132 | if (status < 0) |
133 | mlog_errno(status); | 133 | mlog_errno(status); |
134 | 134 | ||
135 | bail_commit: | ||
135 | ocfs2_commit_trans(osb, handle); | 136 | ocfs2_commit_trans(osb, handle); |
136 | bail_unlock: | 137 | bail_unlock: |
137 | ocfs2_inode_unlock(inode, 1); | 138 | ocfs2_inode_unlock(inode, 1); |
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode, | |||
381 | if (!oifi) { | 382 | if (!oifi) { |
382 | status = -ENOMEM; | 383 | status = -ENOMEM; |
383 | mlog_errno(status); | 384 | mlog_errno(status); |
384 | goto bail; | 385 | goto out_err; |
385 | } | 386 | } |
386 | 387 | ||
387 | if (o2info_from_user(*oifi, req)) | 388 | if (o2info_from_user(*oifi, req)) |
@@ -431,7 +432,7 @@ bail: | |||
431 | o2info_set_request_error(&oifi->ifi_req, req); | 432 | o2info_set_request_error(&oifi->ifi_req, req); |
432 | 433 | ||
433 | kfree(oifi); | 434 | kfree(oifi); |
434 | 435 | out_err: | |
435 | return status; | 436 | return status; |
436 | } | 437 | } |
437 | 438 | ||
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode, | |||
666 | if (!oiff) { | 667 | if (!oiff) { |
667 | status = -ENOMEM; | 668 | status = -ENOMEM; |
668 | mlog_errno(status); | 669 | mlog_errno(status); |
669 | goto bail; | 670 | goto out_err; |
670 | } | 671 | } |
671 | 672 | ||
672 | if (o2info_from_user(*oiff, req)) | 673 | if (o2info_from_user(*oiff, req)) |
@@ -716,7 +717,7 @@ bail: | |||
716 | o2info_set_request_error(&oiff->iff_req, req); | 717 | o2info_set_request_error(&oiff->iff_req, req); |
717 | 718 | ||
718 | kfree(oiff); | 719 | kfree(oiff); |
719 | 720 | out_err: | |
720 | return status; | 721 | return status; |
721 | } | 722 | } |
722 | 723 | ||
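Both ioctl.c fixes are about aiming a goto at the right amount of acquired state: the CAP_LINUX_IMMUTABLE failure happens after the transaction has started, so it must land on a label that still commits the handle, and a failed kzalloc must skip the kfree()/bail path entirely. The "unwind exactly what you built" idiom as a runnable toy; note the success path falls through the same unwind labels, just as ocfs2_commit_trans() runs on both paths above:

#include <stdio.h>
#include <stdlib.h>

/* Each failure jumps to the label that undoes exactly what was acquired. */
static int do_op(int fail_at)
{
	char *buf;
	FILE *f;
	int ret = -1;

	if (fail_at == 0)
		goto out;		/* nothing acquired: nothing to undo */

	buf = malloc(64);
	if (!buf)
		goto out;

	if (fail_at == 1)
		goto out_free;		/* undo the allocation only */

	f = fopen("/dev/null", "w");
	if (!f)
		goto out_free;

	if (fail_at == 2)
		goto out_close;		/* undo stream, then allocation */

	ret = 0;
out_close:
	fclose(f);
out_free:
	free(buf);
out:
	return ret;
}

int main(void)
{
	printf("%d %d %d %d\n", do_op(0), do_op(1), do_op(2), do_op(3));
	return 0;
}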
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 295d56454e8..0a42ae96dca 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
1544 | /* we need to run complete recovery for offline orphan slots */ | 1544 | /* we need to run complete recovery for offline orphan slots */ |
1545 | ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); | 1545 | ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); |
1546 | 1546 | ||
1547 | mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", | 1547 | printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\ |
1548 | node_num, slot_num, | 1548 | "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), |
1549 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 1549 | MINOR(osb->sb->s_dev)); |
1550 | 1550 | ||
1551 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | 1551 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); |
1552 | 1552 | ||
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, | |||
1601 | 1601 | ||
1602 | jbd2_journal_destroy(journal); | 1602 | jbd2_journal_destroy(journal); |
1603 | 1603 | ||
1604 | printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\ | ||
1605 | "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev), | ||
1606 | MINOR(osb->sb->s_dev)); | ||
1604 | done: | 1607 | done: |
1605 | /* drop the lock on this nodes journal */ | 1608 | /* drop the lock on this nodes journal */ |
1606 | if (got_lock) | 1609 | if (got_lock) |
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void) | |||
1808 | * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This | 1811 | * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This |
1809 | * is done to catch any orphans that are left over in orphan directories. | 1812 | * is done to catch any orphans that are left over in orphan directories. |
1810 | * | 1813 | * |
1814 | * It scans all slots, even ones that are in use. It does so to handle the | ||
1815 | * case described below: | ||
1816 | * | ||
1817 | * Node 1 has an inode it was using. The dentry went away due to memory | ||
1818 | * pressure. Node 1 closes the inode, but it's on the free list. The node | ||
1819 | * has the open lock. | ||
1820 | * Node 2 unlinks the inode. It grabs the dentry lock to notify others, | ||
1821 | * but node 1 has no dentry and doesn't get the message. It trylocks the | ||
1822 | * open lock, sees that another node has a PR, and does nothing. | ||
1823 | * Later node 2 runs its orphan dir. It igets the inode, trylocks the | ||
1824 | * open lock, sees the PR still, and does nothing. | ||
1825 | * Basically, we have to trigger an orphan iput on node 1. The only way | ||
1826 | * for this to happen is if node 1 runs node 2's orphan dir. | ||
1827 | * | ||
1811 | * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT | 1828 | * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT |
1812 | * seconds. It gets an EX lock on os_lockres and checks sequence number | 1829 | * seconds. It gets an EX lock on os_lockres and checks sequence number |
1813 | * stored in LVB. If the sequence number has changed, it means some other | 1830 | * stored in LVB. If the sequence number has changed, it means some other |
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 68cf2f6d3c6..a3385b63ff5 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, | |||
441 | #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) | 441 | #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) |
442 | 442 | ||
443 | /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota | 443 | /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota |
444 | * update on dir + index leaf + dx root update for free list */ | 444 | * update on dir + index leaf + dx root update for free list + |
445 | * previous dirblock update in the free list */ | ||
445 | static inline int ocfs2_link_credits(struct super_block *sb) | 446 | static inline int ocfs2_link_credits(struct super_block *sb) |
446 | { | 447 | { |
447 | return 2*OCFS2_INODE_UPDATE_CREDITS + 3 + | 448 | return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + |
448 | ocfs2_quota_trans_credits(sb); | 449 | ocfs2_quota_trans_credits(sb); |
449 | } | 450 | } |
450 | 451 | ||
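The +3 to +4 bump in ocfs2_link_credits() is easiest to see as a block-by-block count. Assuming OCFS2_INODE_UPDATE_CREDITS is 1 (an assumption for the arithmetic; the real constant lives elsewhere in journal.h), a hard link in an indexed directory now reserves:

/* Illustrative only: constant value assumed, quota term left out. */
#define OCFS2_INODE_UPDATE_CREDITS 1

static int link_credits_sans_quota(void)
{
	return 2 * OCFS2_INODE_UPDATE_CREDITS	/* target inode + dir mtime/ctime */
	       + 1				/* dir entry block                */
	       + 1				/* index leaf                     */
	       + 1				/* dx root (free list update)     */
	       + 1;				/* previous dirblock in the free  */
}						/* list -- the new credit; 6 total */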
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 3e9393ca39e..9cd41083e99 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c | |||
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) | |||
61 | static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, | 61 | static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, |
62 | struct page *page) | 62 | struct page *page) |
63 | { | 63 | { |
64 | int ret; | 64 | int ret = VM_FAULT_NOPAGE; |
65 | struct inode *inode = file->f_path.dentry->d_inode; | 65 | struct inode *inode = file->f_path.dentry->d_inode; |
66 | struct address_space *mapping = inode->i_mapping; | 66 | struct address_space *mapping = inode->i_mapping; |
67 | loff_t pos = page_offset(page); | 67 | loff_t pos = page_offset(page); |
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, | |||
71 | void *fsdata; | 71 | void *fsdata; |
72 | loff_t size = i_size_read(inode); | 72 | loff_t size = i_size_read(inode); |
73 | 73 | ||
74 | /* | ||
75 | * Another node might have truncated while we were waiting on | ||
76 | * cluster locks. | ||
77 | * We don't check size == 0 before the shift. This is borrowed | ||
78 | * from do_generic_file_read. | ||
79 | */ | ||
80 | last_index = (size - 1) >> PAGE_CACHE_SHIFT; | 74 | last_index = (size - 1) >> PAGE_CACHE_SHIFT; |
81 | if (unlikely(!size || page->index > last_index)) { | ||
82 | ret = -EINVAL; | ||
83 | goto out; | ||
84 | } | ||
85 | 75 | ||
86 | /* | 76 | /* |
87 | * The i_size check above doesn't catch the case where nodes | 77 | * There are cases that lead to the page no longer belonging to the |
88 | * truncated and then re-extended the file. We'll re-check the | 78 | * mapping. |
89 | * page mapping after taking the page lock inside of | 79 | * 1) pagecache truncates locally due to memory pressure. |
90 | * ocfs2_write_begin_nolock(). | 80 | * 2) pagecache truncates when another is taking EX lock against |
81 | * inode lock. see ocfs2_data_convert_worker. | ||
82 | * | ||
83 | * The i_size check doesn't catch the case where nodes truncated and | ||
84 | * then re-extended the file. We'll re-check the page mapping after | ||
85 | * taking the page lock inside of ocfs2_write_begin_nolock(). | ||
86 | * | ||
87 | * Let VM retry with these cases. | ||
91 | */ | 88 | */ |
92 | if (!PageUptodate(page) || page->mapping != inode->i_mapping) { | 89 | if ((page->mapping != inode->i_mapping) || |
93 | /* | 90 | (!PageUptodate(page)) || |
94 | * the page has been umapped in ocfs2_data_downconvert_worker. | 91 | (page_offset(page) >= size)) |
95 | * So return 0 here and let VFS retry. | ||
96 | */ | ||
97 | ret = 0; | ||
98 | goto out; | 92 | goto out; |
99 | } | ||
100 | 93 | ||
101 | /* | 94 | /* |
102 | * Call ocfs2_write_begin() and ocfs2_write_end() to take | 95 | * Call ocfs2_write_begin() and ocfs2_write_end() to take |
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, | |||
116 | if (ret) { | 109 | if (ret) { |
117 | if (ret != -ENOSPC) | 110 | if (ret != -ENOSPC) |
118 | mlog_errno(ret); | 111 | mlog_errno(ret); |
112 | if (ret == -ENOMEM) | ||
113 | ret = VM_FAULT_OOM; | ||
114 | else | ||
115 | ret = VM_FAULT_SIGBUS; | ||
119 | goto out; | 116 | goto out; |
120 | } | 117 | } |
121 | 118 | ||
122 | ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, | 119 | if (!locked_page) { |
123 | fsdata); | 120 | ret = VM_FAULT_NOPAGE; |
124 | if (ret < 0) { | ||
125 | mlog_errno(ret); | ||
126 | goto out; | 121 | goto out; |
127 | } | 122 | } |
123 | ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, | ||
124 | fsdata); | ||
128 | BUG_ON(ret != len); | 125 | BUG_ON(ret != len); |
129 | ret = 0; | 126 | ret = VM_FAULT_LOCKED; |
130 | out: | 127 | out: |
131 | return ret; | 128 | return ret; |
132 | } | 129 | } |
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
168 | 165 | ||
169 | out: | 166 | out: |
170 | ocfs2_unblock_signals(&oldset); | 167 | ocfs2_unblock_signals(&oldset); |
171 | if (ret) | ||
172 | ret = VM_FAULT_SIGBUS; | ||
173 | return ret; | 168 | return ret; |
174 | } | 169 | } |
175 | 170 | ||
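The thread through all three mmap.c hunks is that ->page_mkwrite() must speak VM_FAULT_* codes, not 0/-errno: stale pages and raced truncates become VM_FAULT_NOPAGE (let the VM retry the fault), -ENOMEM becomes VM_FAULT_OOM, other errors VM_FAULT_SIGBUS, and success returns VM_FAULT_LOCKED with the page still held. A skeleton of that contract only; the real allocation step is elided:

/* Sketch of the return-code contract, not ocfs2's handler. */
static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	int err;

	lock_page(page);
	if (page->mapping != inode->i_mapping || !PageUptodate(page)) {
		unlock_page(page);
		return VM_FAULT_NOPAGE;		/* raced with truncate: retry */
	}

	err = 0;	/* ... allocate/extend so the page may be dirtied ... */
	if (err) {
		unlock_page(page);
		return err == -ENOMEM ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
	}

	return VM_FAULT_LOCKED;			/* page left locked for the VM */
}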
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index d53cb706f14..184c76b8c29 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c | |||
@@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, | |||
745 | */ | 745 | */ |
746 | ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, | 746 | ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, |
747 | new_phys_cpos); | 747 | new_phys_cpos); |
748 | if (!new_phys_cpos) { | 748 | if (!*new_phys_cpos) { |
749 | ret = -ENOSPC; | 749 | ret = -ENOSPC; |
750 | goto out_commit; | 750 | goto out_commit; |
751 | } | 751 | } |
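The move_extents.c fix is a one-character bug worth spelling out: new_phys_cpos is a pointer that ocfs2_probe_alloc_group() writes through, so !new_phys_cpos tests the (always non-NULL) pointer and the ENOSPC branch could never fire, while !*new_phys_cpos tests the probed cluster offset itself. Reduced to a runnable toy (compilers typically warn on the old form):

#include <stdio.h>

/* Stand-in for the probe: reports "no space" by writing 0 through the arg. */
static void probe(unsigned int *new_phys_cpos)
{
	*new_phys_cpos = 0;
}

int main(void)
{
	unsigned int cpos = 123;

	probe(&cpos);
	if (!&cpos)		/* old check: address of a local, never NULL */
		puts("ENOSPC (old check: unreachable)");
	if (!cpos)		/* fixed check: the value the probe stored */
		puts("ENOSPC (fixed check: fires as intended)");
	return 0;
}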
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 409285854f6..d355e6e36b3 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb, | |||
836 | 836 | ||
837 | static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) | 837 | static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) |
838 | { | 838 | { |
839 | __test_and_set_bit_le(bit, bitmap); | 839 | __set_bit_le(bit, bitmap); |
840 | } | 840 | } |
841 | #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) | 841 | #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) |
842 | 842 | ||
843 | static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) | 843 | static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) |
844 | { | 844 | { |
845 | __test_and_clear_bit_le(bit, bitmap); | 845 | __clear_bit_le(bit, bitmap); |
846 | } | 846 | } |
847 | #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) | 847 | #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) |
848 | 848 | ||
849 | #define ocfs2_test_bit test_bit_le | 849 | #define ocfs2_test_bit test_bit_le |
850 | #define ocfs2_find_next_zero_bit find_next_zero_bit_le | 850 | #define ocfs2_find_next_zero_bit find_next_zero_bit_le |
851 | #define ocfs2_find_next_bit find_next_bit_le | 851 | #define ocfs2_find_next_bit find_next_bit_le |
852 | |||
853 | static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr) | ||
854 | { | ||
855 | #if BITS_PER_LONG == 64 | ||
856 | *bit += ((unsigned long) addr & 7UL) << 3; | ||
857 | addr = (void *) ((unsigned long) addr & ~7UL); | ||
858 | #elif BITS_PER_LONG == 32 | ||
859 | *bit += ((unsigned long) addr & 3UL) << 3; | ||
860 | addr = (void *) ((unsigned long) addr & ~3UL); | ||
861 | #else | ||
862 | #error "how many bits you are?!" | ||
863 | #endif | ||
864 | return addr; | ||
865 | } | ||
866 | |||
867 | static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap) | ||
868 | { | ||
869 | bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); | ||
870 | ocfs2_set_bit(bit, bitmap); | ||
871 | } | ||
872 | |||
873 | static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap) | ||
874 | { | ||
875 | bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); | ||
876 | ocfs2_clear_bit(bit, bitmap); | ||
877 | } | ||
878 | |||
879 | static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap) | ||
880 | { | ||
881 | bitmap = correct_addr_and_bit_unaligned(&bit, bitmap); | ||
882 | return ocfs2_test_bit(bit, bitmap); | ||
883 | } | ||
884 | |||
885 | static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max, | ||
886 | int start) | ||
887 | { | ||
888 | int fix = 0, ret, tmpmax; | ||
889 | bitmap = correct_addr_and_bit_unaligned(&fix, bitmap); | ||
890 | tmpmax = max + fix; | ||
891 | start += fix; | ||
892 | |||
893 | ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix; | ||
894 | if (ret > max) | ||
895 | return max; | ||
896 | return ret; | ||
897 | } | ||
898 | |||
852 | #endif /* OCFS2_H */ | 899 | #endif /* OCFS2_H */ |
853 | 900 | ||
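The new *_unaligned helpers added above work by rounding the bitmap pointer down to a native long boundary and folding the discarded low address bits into the bit index (each skipped byte is worth 8 bits, hence the << 3). A self-contained model of the same arithmetic, assuming a 64-bit long:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Round addr down to an 8-byte boundary (64-bit long assumed) and
 * compensate by growing the bit index: each discarded byte is 8 bits. */
static void *correct_addr_and_bit(int *bit, void *addr)
{
    *bit += ((uintptr_t)addr & 7u) << 3;
    return (void *)((uintptr_t)addr & ~(uintptr_t)7u);
}

int main(void)
{
    unsigned long words[2] = {0};
    unsigned char *base = (unsigned char *)words;  /* 8-byte aligned */
    int bit = 5;                                   /* bit 5 from base+3 */

    void *aligned = correct_addr_and_bit(&bit, base + 3);
    assert(aligned == base);      /* rounded down to the long boundary */
    assert(bit == 3 * 8 + 5);     /* 3 skipped bytes folded into the index */
    printf("bit %d from the aligned base\n", bit);
    return 0;
}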
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index dc8007fc924..f100bf70a90 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c | |||
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( | |||
404 | int status = 0; | 404 | int status = 0; |
405 | struct ocfs2_quota_recovery *rec; | 405 | struct ocfs2_quota_recovery *rec; |
406 | 406 | ||
407 | mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); | 407 | printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for " |
408 | "slot %u\n", osb->dev_str, slot_num); | ||
409 | |||
408 | rec = ocfs2_alloc_quota_recovery(); | 410 | rec = ocfs2_alloc_quota_recovery(); |
409 | if (!rec) | 411 | if (!rec) |
410 | return ERR_PTR(-ENOMEM); | 412 | return ERR_PTR(-ENOMEM); |
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, | |||
549 | goto out_commit; | 551 | goto out_commit; |
550 | } | 552 | } |
551 | lock_buffer(qbh); | 553 | lock_buffer(qbh); |
552 | WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); | 554 | WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap)); |
553 | ocfs2_clear_bit(bit, dchunk->dqc_bitmap); | 555 | ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap); |
554 | le32_add_cpu(&dchunk->dqc_free, 1); | 556 | le32_add_cpu(&dchunk->dqc_free, 1); |
555 | unlock_buffer(qbh); | 557 | unlock_buffer(qbh); |
556 | ocfs2_journal_dirty(handle, qbh); | 558 | ocfs2_journal_dirty(handle, qbh); |
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, | |||
596 | struct inode *lqinode; | 598 | struct inode *lqinode; |
597 | unsigned int flags; | 599 | unsigned int flags; |
598 | 600 | ||
599 | mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); | 601 | printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " |
602 | "slot %u\n", osb->dev_str, slot_num); | ||
603 | |||
600 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); | 604 | mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); |
601 | for (type = 0; type < MAXQUOTAS; type++) { | 605 | for (type = 0; type < MAXQUOTAS; type++) { |
602 | if (list_empty(&(rec->r_list[type]))) | 606 | if (list_empty(&(rec->r_list[type]))) |
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, | |||
612 | /* Someone else is holding the lock? Then he must be | 616 | /* Someone else is holding the lock? Then he must be |
613 | * doing the recovery. Just skip the file... */ | 617 | * doing the recovery. Just skip the file... */ |
614 | if (status == -EAGAIN) { | 618 | if (status == -EAGAIN) { |
615 | mlog(ML_NOTICE, "skipping quota recovery for slot %d " | 619 | printk(KERN_NOTICE "ocfs2: Skipping quota recovery on " |
616 | "because quota file is locked.\n", slot_num); | 620 | "device (%s) for slot %d because quota file is " |
621 | "locked.\n", osb->dev_str, slot_num); | ||
617 | status = 0; | 622 | status = 0; |
618 | goto out_put; | 623 | goto out_put; |
619 | } else if (status < 0) { | 624 | } else if (status < 0) { |
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, | |||
944 | * ol_quota_entries_per_block(sb); | 949 | * ol_quota_entries_per_block(sb); |
945 | } | 950 | } |
946 | 951 | ||
947 | found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); | 952 | found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0); |
948 | /* We failed? */ | 953 | /* We failed? */ |
949 | if (found == len) { | 954 | if (found == len) { |
950 | mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" | 955 | mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" |
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private) | |||
1208 | struct ocfs2_local_disk_chunk *dchunk; | 1213 | struct ocfs2_local_disk_chunk *dchunk; |
1209 | 1214 | ||
1210 | dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; | 1215 | dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; |
1211 | ocfs2_set_bit(*offset, dchunk->dqc_bitmap); | 1216 | ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap); |
1212 | le32_add_cpu(&dchunk->dqc_free, -1); | 1217 | le32_add_cpu(&dchunk->dqc_free, -1); |
1213 | } | 1218 | } |
1214 | 1219 | ||
@@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) | |||
1289 | (od->dq_chunk->qc_headerbh->b_data); | 1294 | (od->dq_chunk->qc_headerbh->b_data); |
1290 | /* Mark structure as freed */ | 1295 | /* Mark structure as freed */ |
1291 | lock_buffer(od->dq_chunk->qc_headerbh); | 1296 | lock_buffer(od->dq_chunk->qc_headerbh); |
1292 | ocfs2_clear_bit(offset, dchunk->dqc_bitmap); | 1297 | ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap); |
1293 | le32_add_cpu(&dchunk->dqc_free, 1); | 1298 | le32_add_cpu(&dchunk->dqc_free, 1); |
1294 | unlock_buffer(od->dq_chunk->qc_headerbh); | 1299 | unlock_buffer(od->dq_chunk->qc_headerbh); |
1295 | ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); | 1300 | ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 26fc0014d50..1424c151ccc 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb) | |||
493 | goto bail; | 493 | goto bail; |
494 | } | 494 | } |
495 | } else | 495 | } else |
496 | mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", | 496 | printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " |
497 | slot); | 497 | "allocated to this node!\n", slot, osb->dev_str); |
498 | 498 | ||
499 | ocfs2_set_slot(si, slot, osb->node_num); | 499 | ocfs2_set_slot(si, slot, osb->node_num); |
500 | osb->slot_num = slot; | 500 | osb->slot_num = slot; |
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 19965b00c43..94368017edb 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include "cluster/masklog.h" | 28 | #include "cluster/masklog.h" |
29 | #include "cluster/nodemanager.h" | 29 | #include "cluster/nodemanager.h" |
30 | #include "cluster/heartbeat.h" | 30 | #include "cluster/heartbeat.h" |
31 | #include "cluster/tcp.h" | ||
31 | 32 | ||
32 | #include "stackglue.h" | 33 | #include "stackglue.h" |
33 | 34 | ||
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb) | |||
256 | } | 257 | } |
257 | 258 | ||
258 | /* | 259 | /* |
260 | * Check if this node is heartbeating and is connected to all other | ||
261 | * heartbeating nodes. | ||
262 | */ | ||
263 | static int o2cb_cluster_check(void) | ||
264 | { | ||
265 | u8 node_num; | ||
266 | int i; | ||
267 | unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
268 | unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
269 | |||
270 | node_num = o2nm_this_node(); | ||
271 | if (node_num == O2NM_MAX_NODES) { | ||
272 | printk(KERN_ERR "o2cb: This node has not been configured.\n"); | ||
273 | return -EINVAL; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * o2dlm expects o2net sockets to be created. If not, then | ||
278 | * dlm_join_domain() fails with a stack of errors which are both cryptic | ||
279 | * and incomplete. The idea here is to detect upfront whether we have | ||
280 | * managed to connect to all nodes or not. If not, then list the nodes | ||
281 | * to allow the user to check the configuration (incorrect IP, firewall, | ||
282 | * etc.) Yes, this is racy. But it's not the end of the world. | ||
283 | */ | ||
284 | #define O2CB_MAP_STABILIZE_COUNT 60 | ||
285 | for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) { | ||
286 | o2hb_fill_node_map(hbmap, sizeof(hbmap)); | ||
287 | if (!test_bit(node_num, hbmap)) { | ||
288 | printk(KERN_ERR "o2cb: %s heartbeat has not been " | ||
289 | "started.\n", (o2hb_global_heartbeat_active() ? | ||
290 | "Global" : "Local")); | ||
291 | return -EINVAL; | ||
292 | } | ||
293 | o2net_fill_node_map(netmap, sizeof(netmap)); | ||
294 | /* Force set the current node to allow easy compare */ | ||
295 | set_bit(node_num, netmap); | ||
296 | if (!memcmp(hbmap, netmap, sizeof(hbmap))) | ||
297 | return 0; | ||
298 | if (i < O2CB_MAP_STABILIZE_COUNT) | ||
299 | msleep(1000); | ||
300 | } | ||
301 | |||
302 | printk(KERN_ERR "o2cb: This node could not connect to nodes:"); | ||
303 | i = -1; | ||
304 | while ((i = find_next_bit(hbmap, O2NM_MAX_NODES, | ||
305 | i + 1)) < O2NM_MAX_NODES) { | ||
306 | if (!test_bit(i, netmap)) | ||
307 | printk(" %u", i); | ||
308 | } | ||
309 | printk(".\n"); | ||
310 | |||
311 | return -ENOTCONN; | ||
312 | } | ||
313 | |||
314 | /* | ||
259 | * Called from the dlm when it's about to evict a node. This is how the | 315 | * Called from the dlm when it's about to evict a node. This is how the |
260 | * classic stack signals node death. | 316 | * classic stack signals node death. |
261 | */ | 317 | */ |
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data) | |||
263 | { | 319 | { |
264 | struct ocfs2_cluster_connection *conn = data; | 320 | struct ocfs2_cluster_connection *conn = data; |
265 | 321 | ||
266 | mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", | 322 | printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n", |
267 | node_num, conn->cc_namelen, conn->cc_name); | 323 | node_num, conn->cc_namelen, conn->cc_name); |
268 | 324 | ||
269 | conn->cc_recovery_handler(node_num, conn->cc_recovery_data); | 325 | conn->cc_recovery_handler(node_num, conn->cc_recovery_data); |
270 | } | 326 | } |
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) | |||
280 | BUG_ON(conn == NULL); | 336 | BUG_ON(conn == NULL); |
281 | BUG_ON(conn->cc_proto == NULL); | 337 | BUG_ON(conn->cc_proto == NULL); |
282 | 338 | ||
283 | /* for now we only have one cluster/node, make sure we see it | 339 | /* Ensure cluster stack is up and all nodes are connected */ |
284 | * in the heartbeat universe */ | 340 | rc = o2cb_cluster_check(); |
285 | if (!o2hb_check_local_node_heartbeating()) { | 341 | if (rc) { |
286 | if (o2hb_global_heartbeat_active()) | 342 | printk(KERN_ERR "o2cb: Cluster check failed. Fix errors " |
287 | mlog(ML_ERROR, "Global heartbeat not started\n"); | 343 | "before retrying.\n"); |
288 | rc = -EINVAL; | ||
289 | goto out; | 344 | goto out; |
290 | } | 345 | } |
291 | 346 | ||
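o2cb_cluster_check() above polls two node bitmaps, heartbeat membership versus established o2net sockets, until they agree, force-setting the local node's bit in the net map so a plain memcmp() can compare them. A hedged userspace model of that comparison (MAX_NODES and the maps are stand-ins for the o2nm equivalents):

#include <stdio.h>
#include <string.h>

#define MAX_NODES 8   /* stand-in for O2NM_MAX_NODES */

static int maps_agree(unsigned char hb[], unsigned char net[],
                      int self, size_t len)
{
    /* The local node never "connects to itself", so set its own bit in
     * the net map before comparing byte-for-byte with the hb map. */
    net[self / 8] |= 1u << (self % 8);
    return memcmp(hb, net, len) == 0;
}

int main(void)
{
    unsigned char hb[MAX_NODES / 8]  = { 0x0b };  /* nodes 0,1,3 heartbeating */
    unsigned char net[MAX_NODES / 8] = { 0x0a };  /* connected to 1,3 only */

    /* Node 0 is "us": forcing our own bit makes the maps match. */
    printf("agree: %d\n", maps_agree(hb, net, 0, sizeof(hb)));
    return 0;
}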
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 56f61027236..4994f8b0e60 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -54,6 +54,7 @@ | |||
54 | #include "ocfs1_fs_compat.h" | 54 | #include "ocfs1_fs_compat.h" |
55 | 55 | ||
56 | #include "alloc.h" | 56 | #include "alloc.h" |
57 | #include "aops.h" | ||
57 | #include "blockcheck.h" | 58 | #include "blockcheck.h" |
58 | #include "dlmglue.h" | 59 | #include "dlmglue.h" |
59 | #include "export.h" | 60 | #include "export.h" |
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
1107 | 1108 | ||
1108 | ocfs2_set_ro_flag(osb, 1); | 1109 | ocfs2_set_ro_flag(osb, 1); |
1109 | 1110 | ||
1110 | printk(KERN_NOTICE "Readonly device detected. No cluster " | 1111 | printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. " |
1111 | "services will be utilized for this mount. Recovery " | 1112 | "Cluster services will not be used for this mount. " |
1112 | "will be skipped.\n"); | 1113 | "Recovery will be skipped.\n", osb->dev_str); |
1113 | } | 1114 | } |
1114 | 1115 | ||
1115 | if (!ocfs2_is_hard_readonly(osb)) { | 1116 | if (!ocfs2_is_hard_readonly(osb)) { |
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
1616 | return 0; | 1617 | return 0; |
1617 | } | 1618 | } |
1618 | 1619 | ||
1620 | wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ]; | ||
1621 | |||
1619 | static int __init ocfs2_init(void) | 1622 | static int __init ocfs2_init(void) |
1620 | { | 1623 | { |
1621 | int status; | 1624 | int status, i; |
1622 | 1625 | ||
1623 | ocfs2_print_version(); | 1626 | ocfs2_print_version(); |
1624 | 1627 | ||
1628 | for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++) | ||
1629 | init_waitqueue_head(&ocfs2__ioend_wq[i]); | ||
1630 | |||
1625 | status = init_ocfs2_uptodate_cache(); | 1631 | status = init_ocfs2_uptodate_cache(); |
1626 | if (status < 0) { | 1632 | if (status < 0) { |
1627 | mlog_errno(status); | 1633 | mlog_errno(status); |
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data) | |||
1760 | ocfs2_extent_map_init(&oi->vfs_inode); | 1766 | ocfs2_extent_map_init(&oi->vfs_inode); |
1761 | INIT_LIST_HEAD(&oi->ip_io_markers); | 1767 | INIT_LIST_HEAD(&oi->ip_io_markers); |
1762 | oi->ip_dir_start_lookup = 0; | 1768 | oi->ip_dir_start_lookup = 0; |
1763 | 1769 | atomic_set(&oi->ip_unaligned_aio, 0); | |
1764 | init_rwsem(&oi->ip_alloc_sem); | 1770 | init_rwsem(&oi->ip_alloc_sem); |
1765 | init_rwsem(&oi->ip_xattr_sem); | 1771 | init_rwsem(&oi->ip_xattr_sem); |
1766 | mutex_init(&oi->ip_io_mutex); | 1772 | mutex_init(&oi->ip_io_mutex); |
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1974 | * If we failed before we got a uuid_str yet, we can't stop | 1980 | * If we failed before we got a uuid_str yet, we can't stop |
1975 | * heartbeat. Otherwise, do it. | 1981 | * heartbeat. Otherwise, do it. |
1976 | */ | 1982 | */ |
1977 | if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) | 1983 | if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str && |
1984 | !ocfs2_is_hard_readonly(osb)) | ||
1978 | hangup_needed = 1; | 1985 | hangup_needed = 1; |
1979 | 1986 | ||
1980 | if (osb->cconn) | 1987 | if (osb->cconn) |
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2353 | mlog_errno(status); | 2360 | mlog_errno(status); |
2354 | goto bail; | 2361 | goto bail; |
2355 | } | 2362 | } |
2356 | cleancache_init_shared_fs((char *)&uuid_net_key, sb); | 2363 | cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); |
2357 | 2364 | ||
2358 | bail: | 2365 | bail: |
2359 | return status; | 2366 | return status; |
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) | |||
2462 | goto finally; | 2469 | goto finally; |
2463 | } | 2470 | } |
2464 | } else { | 2471 | } else { |
2465 | mlog(ML_NOTICE, "File system was not unmounted cleanly, " | 2472 | printk(KERN_NOTICE "ocfs2: File system on device (%s) was not " |
2466 | "recovering volume.\n"); | 2473 | "unmounted cleanly, recovering it.\n", osb->dev_str); |
2467 | } | 2474 | } |
2468 | 2475 | ||
2469 | local = ocfs2_mount_local(osb); | 2476 | local = ocfs2_mount_local(osb); |
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 194fb22ef79..aa9e8777b09 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode, | |||
2376 | } | 2376 | } |
2377 | 2377 | ||
2378 | ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); | 2378 | ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); |
2379 | if (ret < 0) { | ||
2380 | mlog_errno(ret); | ||
2381 | break; | ||
2382 | } | ||
2383 | 2379 | ||
2384 | ocfs2_commit_trans(osb, ctxt.handle); | 2380 | ocfs2_commit_trans(osb, ctxt.handle); |
2385 | if (ctxt.meta_ac) { | 2381 | if (ctxt.meta_ac) { |
2386 | ocfs2_free_alloc_context(ctxt.meta_ac); | 2382 | ocfs2_free_alloc_context(ctxt.meta_ac); |
2387 | ctxt.meta_ac = NULL; | 2383 | ctxt.meta_ac = NULL; |
2388 | } | 2384 | } |
2385 | |||
2386 | if (ret < 0) { | ||
2387 | mlog_errno(ret); | ||
2388 | break; | ||
2389 | } | ||
2390 | |||
2389 | } | 2391 | } |
2390 | 2392 | ||
2391 | if (ctxt.meta_ac) | 2393 | if (ctxt.meta_ac) |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 2db1bd3173b..851ba3dcdc2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -1652,46 +1652,12 @@ out: | |||
1652 | return error; | 1652 | return error; |
1653 | } | 1653 | } |
1654 | 1654 | ||
1655 | static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
1656 | struct kstat *stat) | ||
1657 | { | ||
1658 | struct inode *inode = dentry->d_inode; | ||
1659 | struct task_struct *task = get_proc_task(inode); | ||
1660 | int rc; | ||
1661 | |||
1662 | if (task == NULL) | ||
1663 | return -ESRCH; | ||
1664 | |||
1665 | rc = -EACCES; | ||
1666 | if (lock_trace(task)) | ||
1667 | goto out_task; | ||
1668 | |||
1669 | generic_fillattr(inode, stat); | ||
1670 | unlock_trace(task); | ||
1671 | rc = 0; | ||
1672 | out_task: | ||
1673 | put_task_struct(task); | ||
1674 | return rc; | ||
1675 | } | ||
1676 | |||
1677 | static const struct inode_operations proc_pid_link_inode_operations = { | 1655 | static const struct inode_operations proc_pid_link_inode_operations = { |
1678 | .readlink = proc_pid_readlink, | 1656 | .readlink = proc_pid_readlink, |
1679 | .follow_link = proc_pid_follow_link, | 1657 | .follow_link = proc_pid_follow_link, |
1680 | .setattr = proc_setattr, | 1658 | .setattr = proc_setattr, |
1681 | }; | 1659 | }; |
1682 | 1660 | ||
1683 | static const struct inode_operations proc_fdinfo_link_inode_operations = { | ||
1684 | .setattr = proc_setattr, | ||
1685 | .getattr = proc_pid_fd_link_getattr, | ||
1686 | }; | ||
1687 | |||
1688 | static const struct inode_operations proc_fd_link_inode_operations = { | ||
1689 | .readlink = proc_pid_readlink, | ||
1690 | .follow_link = proc_pid_follow_link, | ||
1691 | .setattr = proc_setattr, | ||
1692 | .getattr = proc_pid_fd_link_getattr, | ||
1693 | }; | ||
1694 | |||
1695 | 1661 | ||
1696 | /* building an inode */ | 1662 | /* building an inode */ |
1697 | 1663 | ||
@@ -1923,61 +1889,49 @@ out: | |||
1923 | 1889 | ||
1924 | static int proc_fd_info(struct inode *inode, struct path *path, char *info) | 1890 | static int proc_fd_info(struct inode *inode, struct path *path, char *info) |
1925 | { | 1891 | { |
1926 | struct task_struct *task; | 1892 | struct task_struct *task = get_proc_task(inode); |
1927 | struct files_struct *files; | 1893 | struct files_struct *files = NULL; |
1928 | struct file *file; | 1894 | struct file *file; |
1929 | int fd = proc_fd(inode); | 1895 | int fd = proc_fd(inode); |
1930 | int rc; | ||
1931 | |||
1932 | task = get_proc_task(inode); | ||
1933 | if (!task) | ||
1934 | return -ENOENT; | ||
1935 | |||
1936 | rc = -EACCES; | ||
1937 | if (lock_trace(task)) | ||
1938 | goto out_task; | ||
1939 | |||
1940 | rc = -ENOENT; | ||
1941 | files = get_files_struct(task); | ||
1942 | if (files == NULL) | ||
1943 | goto out_unlock; | ||
1944 | 1896 | ||
1945 | /* | 1897 | if (task) { |
1946 | * We are not taking a ref to the file structure, so we must | 1898 | files = get_files_struct(task); |
1947 | * hold ->file_lock. | 1899 | put_task_struct(task); |
1948 | */ | 1900 | } |
1949 | spin_lock(&files->file_lock); | 1901 | if (files) { |
1950 | file = fcheck_files(files, fd); | 1902 | /* |
1951 | if (file) { | 1903 | * We are not taking a ref to the file structure, so we must |
1952 | unsigned int f_flags; | 1904 | * hold ->file_lock. |
1953 | struct fdtable *fdt; | 1905 | */ |
1954 | 1906 | spin_lock(&files->file_lock); | |
1955 | fdt = files_fdtable(files); | 1907 | file = fcheck_files(files, fd); |
1956 | f_flags = file->f_flags & ~O_CLOEXEC; | 1908 | if (file) { |
1957 | if (FD_ISSET(fd, fdt->close_on_exec)) | 1909 | unsigned int f_flags; |
1958 | f_flags |= O_CLOEXEC; | 1910 | struct fdtable *fdt; |
1959 | 1911 | ||
1960 | if (path) { | 1912 | fdt = files_fdtable(files); |
1961 | *path = file->f_path; | 1913 | f_flags = file->f_flags & ~O_CLOEXEC; |
1962 | path_get(&file->f_path); | 1914 | if (FD_ISSET(fd, fdt->close_on_exec)) |
1915 | f_flags |= O_CLOEXEC; | ||
1916 | |||
1917 | if (path) { | ||
1918 | *path = file->f_path; | ||
1919 | path_get(&file->f_path); | ||
1920 | } | ||
1921 | if (info) | ||
1922 | snprintf(info, PROC_FDINFO_MAX, | ||
1923 | "pos:\t%lli\n" | ||
1924 | "flags:\t0%o\n", | ||
1925 | (long long) file->f_pos, | ||
1926 | f_flags); | ||
1927 | spin_unlock(&files->file_lock); | ||
1928 | put_files_struct(files); | ||
1929 | return 0; | ||
1963 | } | 1930 | } |
1964 | if (info) | 1931 | spin_unlock(&files->file_lock); |
1965 | snprintf(info, PROC_FDINFO_MAX, | 1932 | put_files_struct(files); |
1966 | "pos:\t%lli\n" | 1933 | } |
1967 | "flags:\t0%o\n", | 1934 | return -ENOENT; |
1968 | (long long) file->f_pos, | ||
1969 | f_flags); | ||
1970 | rc = 0; | ||
1971 | } else | ||
1972 | rc = -ENOENT; | ||
1973 | spin_unlock(&files->file_lock); | ||
1974 | put_files_struct(files); | ||
1975 | |||
1976 | out_unlock: | ||
1977 | unlock_trace(task); | ||
1978 | out_task: | ||
1979 | put_task_struct(task); | ||
1980 | return rc; | ||
1981 | } | 1935 | } |
1982 | 1936 | ||
1983 | static int proc_fd_link(struct inode *inode, struct path *path) | 1937 | static int proc_fd_link(struct inode *inode, struct path *path) |
@@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, | |||
2072 | spin_unlock(&files->file_lock); | 2026 | spin_unlock(&files->file_lock); |
2073 | put_files_struct(files); | 2027 | put_files_struct(files); |
2074 | 2028 | ||
2075 | inode->i_op = &proc_fd_link_inode_operations; | 2029 | inode->i_op = &proc_pid_link_inode_operations; |
2076 | inode->i_size = 64; | 2030 | inode->i_size = 64; |
2077 | ei->op.proc_get_link = proc_fd_link; | 2031 | ei->op.proc_get_link = proc_fd_link; |
2078 | d_set_d_op(dentry, &tid_fd_dentry_operations); | 2032 | d_set_d_op(dentry, &tid_fd_dentry_operations); |
@@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, | |||
2104 | if (fd == ~0U) | 2058 | if (fd == ~0U) |
2105 | goto out; | 2059 | goto out; |
2106 | 2060 | ||
2107 | result = ERR_PTR(-EACCES); | ||
2108 | if (lock_trace(task)) | ||
2109 | goto out; | ||
2110 | |||
2111 | result = instantiate(dir, dentry, task, &fd); | 2061 | result = instantiate(dir, dentry, task, &fd); |
2112 | unlock_trace(task); | ||
2113 | out: | 2062 | out: |
2114 | put_task_struct(task); | 2063 | put_task_struct(task); |
2115 | out_no_task: | 2064 | out_no_task: |
@@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent, | |||
2129 | retval = -ENOENT; | 2078 | retval = -ENOENT; |
2130 | if (!p) | 2079 | if (!p) |
2131 | goto out_no_task; | 2080 | goto out_no_task; |
2132 | |||
2133 | retval = -EACCES; | ||
2134 | if (lock_trace(p)) | ||
2135 | goto out; | ||
2136 | |||
2137 | retval = 0; | 2081 | retval = 0; |
2138 | 2082 | ||
2139 | fd = filp->f_pos; | 2083 | fd = filp->f_pos; |
2140 | switch (fd) { | 2084 | switch (fd) { |
2141 | case 0: | 2085 | case 0: |
2142 | if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) | 2086 | if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) |
2143 | goto out_unlock; | 2087 | goto out; |
2144 | filp->f_pos++; | 2088 | filp->f_pos++; |
2145 | case 1: | 2089 | case 1: |
2146 | ino = parent_ino(dentry); | 2090 | ino = parent_ino(dentry); |
2147 | if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) | 2091 | if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) |
2148 | goto out_unlock; | 2092 | goto out; |
2149 | filp->f_pos++; | 2093 | filp->f_pos++; |
2150 | default: | 2094 | default: |
2151 | files = get_files_struct(p); | 2095 | files = get_files_struct(p); |
2152 | if (!files) | 2096 | if (!files) |
2153 | goto out_unlock; | 2097 | goto out; |
2154 | rcu_read_lock(); | 2098 | rcu_read_lock(); |
2155 | for (fd = filp->f_pos-2; | 2099 | for (fd = filp->f_pos-2; |
2156 | fd < files_fdtable(files)->max_fds; | 2100 | fd < files_fdtable(files)->max_fds; |
@@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent, | |||
2174 | rcu_read_unlock(); | 2118 | rcu_read_unlock(); |
2175 | put_files_struct(files); | 2119 | put_files_struct(files); |
2176 | } | 2120 | } |
2177 | |||
2178 | out_unlock: | ||
2179 | unlock_trace(p); | ||
2180 | out: | 2121 | out: |
2181 | put_task_struct(p); | 2122 | put_task_struct(p); |
2182 | out_no_task: | 2123 | out_no_task: |
@@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, | |||
2254 | ei->fd = fd; | 2195 | ei->fd = fd; |
2255 | inode->i_mode = S_IFREG | S_IRUSR; | 2196 | inode->i_mode = S_IFREG | S_IRUSR; |
2256 | inode->i_fop = &proc_fdinfo_file_operations; | 2197 | inode->i_fop = &proc_fdinfo_file_operations; |
2257 | inode->i_op = &proc_fdinfo_link_inode_operations; | ||
2258 | d_set_d_op(dentry, &tid_fd_dentry_operations); | 2198 | d_set_d_op(dentry, &tid_fd_dentry_operations); |
2259 | d_add(dentry, inode); | 2199 | d_add(dentry, inode); |
2260 | /* Close the race of the process dying before we return the dentry */ | 2200 | /* Close the race of the process dying before we return the dentry */ |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 586174168e2..80e4645f799 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -131,12 +131,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
131 | K(i.freeswap), | 131 | K(i.freeswap), |
132 | K(global_page_state(NR_FILE_DIRTY)), | 132 | K(global_page_state(NR_FILE_DIRTY)), |
133 | K(global_page_state(NR_WRITEBACK)), | 133 | K(global_page_state(NR_WRITEBACK)), |
134 | K(global_page_state(NR_ANON_PAGES) | ||
135 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 134 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
135 | K(global_page_state(NR_ANON_PAGES) | ||
136 | + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * | 136 | + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * |
137 | HPAGE_PMD_NR | 137 | HPAGE_PMD_NR), |
138 | #else | ||
139 | K(global_page_state(NR_ANON_PAGES)), | ||
138 | #endif | 140 | #endif |
139 | ), | ||
140 | K(global_page_state(NR_FILE_MAPPED)), | 141 | K(global_page_state(NR_FILE_MAPPED)), |
141 | K(global_page_state(NR_SHMEM)), | 142 | K(global_page_state(NR_SHMEM)), |
142 | K(global_page_state(NR_SLAB_RECLAIMABLE) + | 143 | K(global_page_state(NR_SLAB_RECLAIMABLE) + |
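Before this meminfo fix, the K(...) argument and its closing parenthesis were split across the #ifdef, so only one configuration produced a well-formed argument list. The corrected structure keeps one complete expression per branch; schematically (the CONFIG toggle and 4K page size are assumptions of this demo):

#include <stdio.h>

#define CONFIG_TRANSPARENT_HUGEPAGE 1   /* toggle to exercise either branch */
#define K(x) ((x) * 4)                  /* pages -> kB, assuming 4K pages */

int main(void)
{
    long anon_pages = 1000, thp_pages = 2, hpage_pmd_nr = 512;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    /* THP builds fold transparent huge pages back into the anon total. */
    printf("AnonPages: %8ld kB\n",
           K(anon_pages + thp_pages * hpage_pmd_nr));
#else
    printf("AnonPages: %8ld kB\n", K(anon_pages));
#endif
    return 0;
}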
diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 42b274da92c..2a30d67dd6b 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c | |||
@@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu) | |||
32 | idle = kstat_cpu(cpu).cpustat.idle; | 32 | idle = kstat_cpu(cpu).cpustat.idle; |
33 | idle = cputime64_add(idle, arch_idle_time(cpu)); | 33 | idle = cputime64_add(idle, arch_idle_time(cpu)); |
34 | } else | 34 | } else |
35 | idle = usecs_to_cputime(idle_time); | 35 | idle = nsecs_to_jiffies64(1000 * idle_time); |
36 | 36 | ||
37 | return idle; | 37 | return idle; |
38 | } | 38 | } |
@@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu) | |||
46 | /* !NO_HZ so we can rely on cpustat.iowait */ | 46 | /* !NO_HZ so we can rely on cpustat.iowait */ |
47 | iowait = kstat_cpu(cpu).cpustat.iowait; | 47 | iowait = kstat_cpu(cpu).cpustat.iowait; |
48 | else | 48 | else |
49 | iowait = usecs_to_cputime(iowait_time); | 49 | iowait = nsecs_to_jiffies64(1000 * iowait_time); |
50 | 50 | ||
51 | return iowait; | 51 | return iowait; |
52 | } | 52 | } |
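get_idle_time() and get_iowait_time() receive values in microseconds; converting via nsecs_to_jiffies64(1000 * idle_time) keeps the arithmetic in 64 bits end to end, where the old usecs_to_cputime() could truncate on a 32-bit cputime_t. A userspace approximation of the conversion (HZ assumed to be 100 here; the division is exact only when HZ divides 1e9):

#include <stdint.h>
#include <stdio.h>

#define HZ 100ULL   /* assumed tick rate for the demo */

static uint64_t nsecs_to_jiffies64(uint64_t ns)
{
    return ns / (1000000000ULL / HZ);   /* simplified model of the kernel helper */
}

int main(void)
{
    uint64_t idle_usecs = 36000000000ULL;   /* 10 hours idle: overflows 32 bits */
    printf("idle jiffies: %llu\n",
           (unsigned long long)nsecs_to_jiffies64(1000 * idle_usecs));
    return 0;
}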
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cd99bf55765..b0f450a2bb7 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/user.h> | 12 | #include <linux/user.h> |
13 | #include <linux/elf.h> | 13 | #include <linux/elf.h> |
14 | #include <linux/elfcore.h> | 14 | #include <linux/elfcore.h> |
15 | #include <linux/export.h> | ||
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
16 | #include <linux/highmem.h> | 17 | #include <linux/highmem.h> |
17 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 2bd620f0d79..57bbf9078ac 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c | |||
@@ -167,6 +167,7 @@ int pstore_register(struct pstore_info *psi) | |||
167 | } | 167 | } |
168 | 168 | ||
169 | psinfo = psi; | 169 | psinfo = psi; |
170 | mutex_init(&psinfo->read_mutex); | ||
170 | spin_unlock(&pstore_lock); | 171 | spin_unlock(&pstore_lock); |
171 | 172 | ||
172 | if (owner && !try_module_get(owner)) { | 173 | if (owner && !try_module_get(owner)) { |
@@ -195,30 +196,32 @@ EXPORT_SYMBOL_GPL(pstore_register); | |||
195 | void pstore_get_records(int quiet) | 196 | void pstore_get_records(int quiet) |
196 | { | 197 | { |
197 | struct pstore_info *psi = psinfo; | 198 | struct pstore_info *psi = psinfo; |
199 | char *buf = NULL; | ||
198 | ssize_t size; | 200 | ssize_t size; |
199 | u64 id; | 201 | u64 id; |
200 | enum pstore_type_id type; | 202 | enum pstore_type_id type; |
201 | struct timespec time; | 203 | struct timespec time; |
202 | int failed = 0, rc; | 204 | int failed = 0, rc; |
203 | unsigned long flags; | ||
204 | 205 | ||
205 | if (!psi) | 206 | if (!psi) |
206 | return; | 207 | return; |
207 | 208 | ||
208 | spin_lock_irqsave(&psinfo->buf_lock, flags); | 209 | mutex_lock(&psi->read_mutex); |
209 | rc = psi->open(psi); | 210 | rc = psi->open(psi); |
210 | if (rc) | 211 | if (rc) |
211 | goto out; | 212 | goto out; |
212 | 213 | ||
213 | while ((size = psi->read(&id, &type, &time, psi)) > 0) { | 214 | while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { |
214 | rc = pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, | 215 | rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, |
215 | time, psi); | 216 | time, psi); |
217 | kfree(buf); | ||
218 | buf = NULL; | ||
216 | if (rc && (rc != -EEXIST || !quiet)) | 219 | if (rc && (rc != -EEXIST || !quiet)) |
217 | failed++; | 220 | failed++; |
218 | } | 221 | } |
219 | psi->close(psi); | 222 | psi->close(psi); |
220 | out: | 223 | out: |
221 | spin_unlock_irqrestore(&psinfo->buf_lock, flags); | 224 | mutex_unlock(&psi->read_mutex); |
222 | 225 | ||
223 | if (failed) | 226 | if (failed) |
224 | printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", | 227 | printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n", |
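After this pstore change the read loop owns each record buffer: psi->read() allocates into *buf, pstore_mkfile() consumes the contents, and the loop frees the buffer before the next iteration, all under a mutex so backends may sleep. The ownership pattern in miniature (read_record() is a made-up stand-in for a backend):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pretend backend: allocates the record and hands ownership to the caller. */
static long read_record(int i, char **buf)
{
    if (i >= 3)
        return 0;                  /* no more records */
    *buf = strdup("record");
    return (long)strlen(*buf);
}

int main(void)
{
    char *buf = NULL;
    long size;
    int i = 0;

    while ((size = read_record(i++, &buf)) > 0) {
        printf("got %ld bytes: %s\n", size, buf);
        free(buf);                 /* caller frees each record... */
        buf = NULL;                /* ...and drops the stale pointer */
    }
    return 0;
}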
diff --git a/fs/quota/quota.c b/fs/quota/quota.c index aae0edb95c6..35f4b0ecdeb 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c | |||
@@ -286,7 +286,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, | |||
286 | /* caller already holds s_umount */ | 286 | /* caller already holds s_umount */ |
287 | if (sb->s_flags & MS_RDONLY) | 287 | if (sb->s_flags & MS_RDONLY) |
288 | return -EROFS; | 288 | return -EROFS; |
289 | writeback_inodes_sb(sb); | 289 | writeback_inodes_sb(sb, WB_REASON_SYNC); |
290 | return 0; | 290 | return 0; |
291 | default: | 291 | default: |
292 | return -EINVAL; | 292 | return -EINVAL; |
diff --git a/fs/seq_file.c b/fs/seq_file.c index 05d6b0e78c9..dba43c3ea3a 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c | |||
@@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path); | |||
449 | 449 | ||
450 | /* | 450 | /* |
451 | * Same as seq_path, but relative to supplied root. | 451 | * Same as seq_path, but relative to supplied root. |
452 | * | ||
453 | * root may be changed, see __d_path(). | ||
454 | */ | 452 | */ |
455 | int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | 453 | int seq_path_root(struct seq_file *m, struct path *path, struct path *root, |
456 | char *esc) | 454 | char *esc) |
@@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | |||
463 | char *p; | 461 | char *p; |
464 | 462 | ||
465 | p = __d_path(path, root, buf, size); | 463 | p = __d_path(path, root, buf, size); |
464 | if (!p) | ||
465 | return SEQ_SKIP; | ||
466 | res = PTR_ERR(p); | 466 | res = PTR_ERR(p); |
467 | if (!IS_ERR(p)) { | 467 | if (!IS_ERR(p)) { |
468 | char *end = mangle_path(buf, p, esc); | 468 | char *end = mangle_path(buf, p, esc); |
@@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, | |||
474 | } | 474 | } |
475 | seq_commit(m, res); | 475 | seq_commit(m, res); |
476 | 476 | ||
477 | return res < 0 ? res : 0; | 477 | return res < 0 && res != -ENAMETOOLONG ? res : 0; |
478 | } | 478 | } |
479 | 479 | ||
480 | /* | 480 | /* |
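With the __d_path() change, a path that is unreachable from the supplied root now comes back as NULL, which seq_path_root() turns into SEQ_SKIP (omit the record), while -ENAMETOOLONG is tolerated rather than propagated. A loose sketch of that decision table, with the result pointer and error modeled as separate arguments for clarity:

#include <errno.h>
#include <stdio.h>

#define SEQ_SKIP 1   /* stand-in for the seq_file sentinel */

/* Map a __d_path()-style outcome to what the iterator should do. */
static int classify(char *p, long err)
{
    if (!p)
        return SEQ_SKIP;                 /* path not reachable from root */
    if (err < 0 && err != -ENAMETOOLONG)
        return (int)err;                 /* real error: propagate */
    return 0;                            /* emitted (or silently truncated) */
}

int main(void)
{
    char buf[] = "/mnt/data";
    printf("%d %d %d\n",
           classify(buf, 0),
           classify(NULL, 0),
           classify(buf, -ENAMETOOLONG));
    return 0;
}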
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index 048b59d5b2f..c70111ebefd 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig | |||
@@ -78,6 +78,28 @@ config SQUASHFS_XZ | |||
78 | 78 | ||
79 | If unsure, say N. | 79 | If unsure, say N. |
80 | 80 | ||
81 | config SQUASHFS_4K_DEVBLK_SIZE | ||
82 | bool "Use 4K device block size?" | ||
83 | depends on SQUASHFS | ||
84 | help | ||
85 | By default Squashfs sets the dev block size (sb_min_blocksize) | ||
86 | to 1K or the smallest block size supported by the block device | ||
87 | (if larger). This, because blocks are packed together and | ||
88 | unaligned in Squashfs, should reduce latency. | ||
89 | |||
90 | This, however, gives poor performance on MTD NAND devices where | ||
91 | the optimal I/O size is 4K (even though the devices can support | ||
92 | smaller block sizes). | ||
93 | |||
94 | Using a 4K device block size may also improve overall I/O | ||
95 | performance for some file access patterns (e.g. sequential | ||
96 | accesses of files in filesystem order) on all media. | ||
97 | |||
98 | Setting this option will force Squashfs to use a 4K device block | ||
99 | size by default. | ||
100 | |||
101 | If unsure, say N. | ||
102 | |||
81 | config SQUASHFS_EMBEDDED | 103 | config SQUASHFS_EMBEDDED |
82 | bool "Additional option for memory-constrained systems" | 104 | bool "Additional option for memory-constrained systems" |
83 | depends on SQUASHFS | 105 | depends on SQUASHFS |
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index b4a4e539a08..e8e14645de9 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h | |||
@@ -36,6 +36,13 @@ | |||
36 | #define SQUASHFS_FILE_SIZE 131072 | 36 | #define SQUASHFS_FILE_SIZE 131072 |
37 | #define SQUASHFS_FILE_LOG 17 | 37 | #define SQUASHFS_FILE_LOG 17 |
38 | 38 | ||
39 | /* default size of block device I/O */ | ||
40 | #ifdef CONFIG_SQUASHFS_4K_DEVBLK_SIZE | ||
41 | #define SQUASHFS_DEVBLK_SIZE 4096 | ||
42 | #else | ||
43 | #define SQUASHFS_DEVBLK_SIZE 1024 | ||
44 | #endif | ||
45 | |||
39 | #define SQUASHFS_FILE_MAX_SIZE 1048576 | 46 | #define SQUASHFS_FILE_MAX_SIZE 1048576 |
40 | #define SQUASHFS_FILE_MAX_LOG 20 | 47 | #define SQUASHFS_FILE_MAX_LOG 20 |
41 | 48 | ||
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 7438850c62d..2da1715452a 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c | |||
@@ -95,7 +95,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) | |||
95 | } | 95 | } |
96 | msblk = sb->s_fs_info; | 96 | msblk = sb->s_fs_info; |
97 | 97 | ||
98 | msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); | 98 | msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); |
99 | msblk->devblksize_log2 = ffz(~msblk->devblksize); | 99 | msblk->devblksize_log2 = ffz(~msblk->devblksize); |
100 | 100 | ||
101 | mutex_init(&msblk->read_data_mutex); | 101 | mutex_init(&msblk->read_data_mutex); |
diff --git a/fs/statfs.c b/fs/statfs.c index 8244924dec5..9cf04a11896 100644 --- a/fs/statfs.c +++ b/fs/statfs.c | |||
@@ -76,7 +76,7 @@ EXPORT_SYMBOL(vfs_statfs); | |||
76 | int user_statfs(const char __user *pathname, struct kstatfs *st) | 76 | int user_statfs(const char __user *pathname, struct kstatfs *st) |
77 | { | 77 | { |
78 | struct path path; | 78 | struct path path; |
79 | int error = user_path(pathname, &path); | 79 | int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); |
80 | if (!error) { | 80 | if (!error) { |
81 | error = vfs_statfs(&path, st); | 81 | error = vfs_statfs(&path, st); |
82 | path_put(&path); | 82 | path_put(&path); |
@@ -43,7 +43,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) | |||
43 | if (wait) | 43 | if (wait) |
44 | sync_inodes_sb(sb); | 44 | sync_inodes_sb(sb); |
45 | else | 45 | else |
46 | writeback_inodes_sb(sb); | 46 | writeback_inodes_sb(sb, WB_REASON_SYNC); |
47 | 47 | ||
48 | if (sb->s_op->sync_fs) | 48 | if (sb->s_op->sync_fs) |
49 | sb->s_op->sync_fs(sb, wait); | 49 | sb->s_op->sync_fs(sb, wait); |
@@ -98,7 +98,7 @@ static void sync_filesystems(int wait) | |||
98 | */ | 98 | */ |
99 | SYSCALL_DEFINE0(sync) | 99 | SYSCALL_DEFINE0(sync) |
100 | { | 100 | { |
101 | wakeup_flusher_threads(0); | 101 | wakeup_flusher_threads(0, WB_REASON_SYNC); |
102 | sync_filesystems(0); | 102 | sync_filesystems(0); |
103 | sync_filesystems(1); | 103 | sync_filesystems(1); |
104 | if (unlikely(laptop_mode)) | 104 | if (unlikely(laptop_mode)) |
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 315de66e52b..bc4f94b2870 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c | |||
@@ -63,7 +63,7 @@ | |||
63 | static void shrink_liability(struct ubifs_info *c, int nr_to_write) | 63 | static void shrink_liability(struct ubifs_info *c, int nr_to_write) |
64 | { | 64 | { |
65 | down_read(&c->vfs_sb->s_umount); | 65 | down_read(&c->vfs_sb->s_umount); |
66 | writeback_inodes_sb(c->vfs_sb); | 66 | writeback_inodes_sb(c->vfs_sb, WB_REASON_FS_FREE_SPACE); |
67 | up_read(&c->vfs_sb->s_umount); | 67 | up_read(&c->vfs_sb->s_umount); |
68 | } | 68 | } |
69 | 69 | ||
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index eef109a1a92..b09ba2dd8b6 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c | |||
@@ -870,6 +870,22 @@ void dbg_dump_lpt_info(struct ubifs_info *c) | |||
870 | spin_unlock(&dbg_lock); | 870 | spin_unlock(&dbg_lock); |
871 | } | 871 | } |
872 | 872 | ||
873 | void dbg_dump_sleb(const struct ubifs_info *c, | ||
874 | const struct ubifs_scan_leb *sleb, int offs) | ||
875 | { | ||
876 | struct ubifs_scan_node *snod; | ||
877 | |||
878 | printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n", | ||
879 | current->pid, sleb->lnum, offs); | ||
880 | |||
881 | list_for_each_entry(snod, &sleb->nodes, list) { | ||
882 | cond_resched(); | ||
883 | printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum, | ||
884 | snod->offs, snod->len); | ||
885 | dbg_dump_node(c, snod->node); | ||
886 | } | ||
887 | } | ||
888 | |||
873 | void dbg_dump_leb(const struct ubifs_info *c, int lnum) | 889 | void dbg_dump_leb(const struct ubifs_info *c, int lnum) |
874 | { | 890 | { |
875 | struct ubifs_scan_leb *sleb; | 891 | struct ubifs_scan_leb *sleb; |
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index feb361e252a..8d9c4681018 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h | |||
@@ -269,6 +269,8 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); | |||
269 | void dbg_dump_lprops(struct ubifs_info *c); | 269 | void dbg_dump_lprops(struct ubifs_info *c); |
270 | void dbg_dump_lpt_info(struct ubifs_info *c); | 270 | void dbg_dump_lpt_info(struct ubifs_info *c); |
271 | void dbg_dump_leb(const struct ubifs_info *c, int lnum); | 271 | void dbg_dump_leb(const struct ubifs_info *c, int lnum); |
272 | void dbg_dump_sleb(const struct ubifs_info *c, | ||
273 | const struct ubifs_scan_leb *sleb, int offs); | ||
272 | void dbg_dump_znode(const struct ubifs_info *c, | 274 | void dbg_dump_znode(const struct ubifs_info *c, |
273 | const struct ubifs_znode *znode); | 275 | const struct ubifs_znode *znode); |
274 | void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); | 276 | void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); |
@@ -387,6 +389,9 @@ static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; } | |||
387 | static inline void dbg_dump_leb(const struct ubifs_info *c, | 389 | static inline void dbg_dump_leb(const struct ubifs_info *c, |
388 | int lnum) { return; } | 390 | int lnum) { return; } |
389 | static inline void | 391 | static inline void |
392 | dbg_dump_sleb(const struct ubifs_info *c, | ||
393 | const struct ubifs_scan_leb *sleb, int offs) { return; } | ||
394 | static inline void | ||
390 | dbg_dump_znode(const struct ubifs_info *c, | 395 | dbg_dump_znode(const struct ubifs_info *c, |
391 | const struct ubifs_znode *znode) { return; } | 396 | const struct ubifs_znode *znode) { return; } |
392 | static inline void dbg_dump_heap(struct ubifs_info *c, | 397 | static inline void dbg_dump_heap(struct ubifs_info *c, |
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index af02790d932..ee4f43f4bb9 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c | |||
@@ -983,7 +983,7 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) | |||
983 | } | 983 | } |
984 | 984 | ||
985 | /** | 985 | /** |
986 | * clean_an_unclean_leb - read and write a LEB to remove corruption. | 986 | * clean_an_unclean_leb - read and write a LEB to remove corruption. |
987 | * @c: UBIFS file-system description object | 987 | * @c: UBIFS file-system description object |
988 | * @ucleb: unclean LEB information | 988 | * @ucleb: unclean LEB information |
989 | * @sbuf: LEB-sized buffer to use | 989 | * @sbuf: LEB-sized buffer to use |
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 93d938ad3d2..6094c5a5d7a 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c | |||
@@ -247,7 +247,7 @@ static int create_default_filesystem(struct ubifs_info *c) | |||
247 | mst->total_dirty = cpu_to_le64(tmp64); | 247 | mst->total_dirty = cpu_to_le64(tmp64); |
248 | 248 | ||
249 | /* The indexing LEB does not contribute to dark space */ | 249 | /* The indexing LEB does not contribute to dark space */ |
250 | tmp64 = (c->main_lebs - 1) * c->dark_wm; | 250 | tmp64 = ((long long)(c->main_lebs - 1) * c->dark_wm); |
251 | mst->total_dark = cpu_to_le64(tmp64); | 251 | mst->total_dark = cpu_to_le64(tmp64); |
252 | 252 | ||
253 | mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); | 253 | mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); |
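The (long long) cast in create_default_filesystem() matters because both operands are 32-bit: without it the multiply happens in 32 bits and only the already-wrapped result is widened. Demonstrably (unsigned types used so the wraparound is well defined):

#include <stdio.h>

int main(void)
{
    unsigned int main_lebs = 70000, dark_wm = 65536;   /* plausible large volume */

    unsigned long long wrong =
        (main_lebs - 1) * dark_wm;                     /* 32-bit multiply, wraps */
    unsigned long long right =
        (unsigned long long)(main_lebs - 1) * dark_wm; /* widened before multiply */

    printf("truncated: %llu\ncorrect:   %llu\n", wrong, right);
    return 0;
}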
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index b6c4b3795c4..76e4266d2e7 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c | |||
@@ -42,6 +42,8 @@ xfs_acl_from_disk(struct xfs_acl *aclp) | |||
42 | int count, i; | 42 | int count, i; |
43 | 43 | ||
44 | count = be32_to_cpu(aclp->acl_cnt); | 44 | count = be32_to_cpu(aclp->acl_cnt); |
45 | if (count > XFS_ACL_MAX_ENTRIES) | ||
46 | return ERR_PTR(-EFSCORRUPTED); | ||
45 | 47 | ||
46 | acl = posix_acl_alloc(count, GFP_KERNEL); | 48 | acl = posix_acl_alloc(count, GFP_KERNEL); |
47 | if (!acl) | 49 | if (!acl) |
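The added acl_cnt check is a standard validate-before-allocate guard: the count comes straight off disk, so a corrupted value must be bounded before it sizes an allocation. In miniature (MAX_ENTRIES, the entry size, and the errno are demo stand-ins; the kernel returns -EFSCORRUPTED here):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ENTRIES 25   /* stand-in for XFS_ACL_MAX_ENTRIES */

static void *acl_from_disk(unsigned int count, int *err)
{
    /* Reject on-disk counts beyond the format's limit before they can
     * drive an oversized allocation. */
    if (count > MAX_ENTRIES) {
        *err = -EINVAL;
        return NULL;
    }
    *err = 0;
    return calloc(count, 16 /* bytes per entry, illustrative */);
}

int main(void)
{
    int err;
    void *acl = acl_from_disk(0x40000000u, &err);   /* hostile on-disk count */
    printf("acl=%p err=%d\n", acl, err);
    free(acl);
    return 0;
}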
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 33b13310ee0..574d4ee9b62 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
@@ -189,7 +189,7 @@ xfs_end_io( | |||
189 | int error = 0; | 189 | int error = 0; |
190 | 190 | ||
191 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 191 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
192 | error = -EIO; | 192 | ioend->io_error = -EIO; |
193 | goto done; | 193 | goto done; |
194 | } | 194 | } |
195 | if (ioend->io_error) | 195 | if (ioend->io_error) |
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d4906e7c978..c1b55e59655 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c | |||
@@ -110,6 +110,7 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags) | |||
110 | /* | 110 | /* |
111 | * Query whether the requested number of additional bytes of extended | 111 | * Query whether the requested number of additional bytes of extended |
112 | * attribute space will be able to fit inline. | 112 | * attribute space will be able to fit inline. |
113 | * | ||
113 | * Returns zero if not, else the di_forkoff fork offset to be used in the | 114 | * Returns zero if not, else the di_forkoff fork offset to be used in the |
114 | * literal area for attribute data once the new bytes have been added. | 115 | * literal area for attribute data once the new bytes have been added. |
115 | * | 116 | * |
@@ -122,7 +123,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) | |||
122 | int offset; | 123 | int offset; |
123 | int minforkoff; /* lower limit on valid forkoff locations */ | 124 | int minforkoff; /* lower limit on valid forkoff locations */ |
124 | int maxforkoff; /* upper limit on valid forkoff locations */ | 125 | int maxforkoff; /* upper limit on valid forkoff locations */ |
125 | int dsize; | 126 | int dsize; |
126 | xfs_mount_t *mp = dp->i_mount; | 127 | xfs_mount_t *mp = dp->i_mount; |
127 | 128 | ||
128 | offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ | 129 | offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ |
@@ -136,47 +137,60 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) | |||
136 | return (offset >= minforkoff) ? minforkoff : 0; | 137 | return (offset >= minforkoff) ? minforkoff : 0; |
137 | } | 138 | } |
138 | 139 | ||
139 | if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { | 140 | /* |
140 | if (bytes <= XFS_IFORK_ASIZE(dp)) | 141 | * If the requested numbers of bytes is smaller or equal to the |
141 | return dp->i_d.di_forkoff; | 142 | * current attribute fork size we can always proceed. |
143 | * | ||
144 | * Note that if_bytes in the data fork might actually be larger than | ||
145 | * the current data fork size is due to delalloc extents. In that | ||
146 | * case either the extent count will go down when they are converted | ||
147 | * to real extents, or the delalloc conversion will take care of the | ||
148 | * literal area rebalancing. | ||
149 | */ | ||
150 | if (bytes <= XFS_IFORK_ASIZE(dp)) | ||
151 | return dp->i_d.di_forkoff; | ||
152 | |||
153 | /* | ||
154 | * For attr2 we can try to move the forkoff if there is space in the | ||
155 | * literal area, but for the old format we are done if there is no | ||
156 | * space in the fixed attribute fork. | ||
157 | */ | ||
158 | if (!(mp->m_flags & XFS_MOUNT_ATTR2)) | ||
142 | return 0; | 159 | return 0; |
143 | } | ||
144 | 160 | ||
145 | dsize = dp->i_df.if_bytes; | 161 | dsize = dp->i_df.if_bytes; |
146 | 162 | ||
147 | switch (dp->i_d.di_format) { | 163 | switch (dp->i_d.di_format) { |
148 | case XFS_DINODE_FMT_EXTENTS: | 164 | case XFS_DINODE_FMT_EXTENTS: |
149 | /* | 165 | /* |
150 | * If there is no attr fork and the data fork is extents, | 166 | * If there is no attr fork and the data fork is extents, |
151 | * determine if creating the default attr fork will result | 167 | * determine if creating the default attr fork will result |
152 | * in the extents form migrating to btree. If so, the | 168 | * in the extents form migrating to btree. If so, the |
153 | * minimum offset only needs to be the space required for | 169 | * minimum offset only needs to be the space required for |
154 | * the btree root. | 170 | * the btree root. |
155 | */ | 171 | */ |
156 | if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > | 172 | if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > |
157 | xfs_default_attroffset(dp)) | 173 | xfs_default_attroffset(dp)) |
158 | dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); | 174 | dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); |
159 | break; | 175 | break; |
160 | |||
161 | case XFS_DINODE_FMT_BTREE: | 176 | case XFS_DINODE_FMT_BTREE: |
162 | /* | 177 | /* |
163 | * If have data btree then keep forkoff if we have one, | 178 | * If we have a data btree then keep forkoff if we have one, |
164 | * otherwise we are adding a new attr, so then we set | 179 | * otherwise we are adding a new attr, so then we set |
165 | * minforkoff to where the btree root can finish so we have | 180 | * minforkoff to where the btree root can finish so we have |
166 | * plenty of room for attrs | 181 | * plenty of room for attrs |
167 | */ | 182 | */ |
168 | if (dp->i_d.di_forkoff) { | 183 | if (dp->i_d.di_forkoff) { |
169 | if (offset < dp->i_d.di_forkoff) | 184 | if (offset < dp->i_d.di_forkoff) |
170 | return 0; | 185 | return 0; |
171 | else | 186 | return dp->i_d.di_forkoff; |
172 | return dp->i_d.di_forkoff; | 187 | } |
173 | } else | 188 | dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); |
174 | dsize = XFS_BMAP_BROOT_SPACE(dp->i_df.if_broot); | ||
175 | break; | 189 | break; |
176 | } | 190 | } |
177 | 191 | ||
178 | /* | 192 | /* |
179 | * A data fork btree root must have space for at least | 193 | * A data fork btree root must have space for at least |
180 | * MINDBTPTRS key/ptr pairs if the data fork is small or empty. | 194 | * MINDBTPTRS key/ptr pairs if the data fork is small or empty. |
181 | */ | 195 | */ |
182 | minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); | 196 | minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); |
@@ -186,10 +200,10 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) | |||
186 | maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); | 200 | maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); |
187 | maxforkoff = maxforkoff >> 3; /* rounded down */ | 201 | maxforkoff = maxforkoff >> 3; /* rounded down */ |
188 | 202 | ||
189 | if (offset >= minforkoff && offset < maxforkoff) | ||
190 | return offset; | ||
191 | if (offset >= maxforkoff) | 203 | if (offset >= maxforkoff) |
192 | return maxforkoff; | 204 | return maxforkoff; |
205 | if (offset >= minforkoff) | ||
206 | return offset; | ||
193 | return 0; | 207 | return 0; |
194 | } | 208 | } |
195 | 209 | ||
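The reordered tail of xfs_attr_shortform_bytesfit() expresses the same clamp with one comparison per branch: past the limit returns maxforkoff, within range returns the offset, below minforkoff returns 0. As a standalone helper:

#include <stdio.h>

/* Clamp offset into [minoff, maxoff], returning 0 when it's below minoff. */
static int clamp_forkoff(int offset, int minoff, int maxoff)
{
    if (offset >= maxoff)
        return maxoff;
    if (offset >= minoff)
        return offset;
    return 0;
}

int main(void)
{
    printf("%d %d %d\n",
           clamp_forkoff(5, 10, 20),    /* below min -> 0  */
           clamp_forkoff(15, 10, 20),   /* in range  -> 15 */
           clamp_forkoff(25, 10, 20));  /* above max -> 20 */
    return 0;
}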
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index c68baeb0974..d0ab7883705 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
@@ -2383,6 +2383,8 @@ xfs_bmap_btalloc( | |||
2383 | int tryagain; | 2383 | int tryagain; |
2384 | int error; | 2384 | int error; |
2385 | 2385 | ||
2386 | ASSERT(ap->length); | ||
2387 | |||
2386 | mp = ap->ip->i_mount; | 2388 | mp = ap->ip->i_mount; |
2387 | align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; | 2389 | align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; |
2388 | if (unlikely(align)) { | 2390 | if (unlikely(align)) { |
@@ -4629,6 +4631,8 @@ xfs_bmapi_allocate( | |||
4629 | int error; | 4631 | int error; |
4630 | int rt; | 4632 | int rt; |
4631 | 4633 | ||
4634 | ASSERT(bma->length > 0); | ||
4635 | |||
4632 | rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); | 4636 | rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip); |
4633 | 4637 | ||
4634 | /* | 4638 | /* |
@@ -4849,6 +4853,7 @@ xfs_bmapi_write( | |||
4849 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); | 4853 | ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); |
4850 | ASSERT(!(flags & XFS_BMAPI_IGSTATE)); | 4854 | ASSERT(!(flags & XFS_BMAPI_IGSTATE)); |
4851 | ASSERT(tp != NULL); | 4855 | ASSERT(tp != NULL); |
4856 | ASSERT(len > 0); | ||
4852 | 4857 | ||
4853 | whichfork = (flags & XFS_BMAPI_ATTRFORK) ? | 4858 | whichfork = (flags & XFS_BMAPI_ATTRFORK) ? |
4854 | XFS_ATTR_FORK : XFS_DATA_FORK; | 4859 | XFS_ATTR_FORK : XFS_DATA_FORK; |
@@ -4918,9 +4923,22 @@ xfs_bmapi_write( | |||
4918 | bma.eof = eof; | 4923 | bma.eof = eof; |
4919 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); | 4924 | bma.conv = !!(flags & XFS_BMAPI_CONVERT); |
4920 | bma.wasdel = wasdelay; | 4925 | bma.wasdel = wasdelay; |
4921 | bma.length = len; | ||
4922 | bma.offset = bno; | 4926 | bma.offset = bno; |
4923 | 4927 | ||
4928 | /* | ||
4929 | * There's a 32/64 bit type mismatch between the | ||
4930 | * allocation length request (which can be 64 bits in | ||
4931 | * length) and the bma length request, which is | ||
4932 | * xfs_extlen_t and therefore 32 bits. Hence we have to | ||
4933 | * check for 32-bit overflows and handle them here. | ||
4934 | */ | ||
4935 | if (len > (xfs_filblks_t)MAXEXTLEN) | ||
4936 | bma.length = MAXEXTLEN; | ||
4937 | else | ||
4938 | bma.length = len; | ||
4939 | |||
4940 | ASSERT(len > 0); | ||
4941 | ASSERT(bma.length > 0); | ||
4924 | error = xfs_bmapi_allocate(&bma, flags); | 4942 | error = xfs_bmapi_allocate(&bma, flags); |
4925 | if (error) | 4943 | if (error) |
4926 | goto error0; | 4944 | goto error0; |
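The clamp commented above exists because the request length is 64-bit (xfs_filblks_t) while bma.length is 32-bit (xfs_extlen_t); assigning without the check would silently truncate. A compact illustration with the kernel types modeled as fixed-width integers (the 21-bit MAXEXTLEN models XFS's extent-length cap):

#include <stdint.h>
#include <stdio.h>

#define MAXEXTLEN ((1u << 21) - 1)   /* illustrative extent-length cap */

int main(void)
{
    uint64_t len = 1ULL << 32;        /* xfs_filblks_t-sized request */
    uint32_t bma_length;              /* xfs_extlen_t-sized field */

    /* Clamp before narrowing; a plain assignment would silently truncate
     * 2^32 to 0 and trip the ASSERT(bma.length > 0). */
    bma_length = (len > (uint64_t)MAXEXTLEN) ? MAXEXTLEN : (uint32_t)len;

    printf("requested %llu, allocating %u\n",
           (unsigned long long)len, bma_length);
    return 0;
}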
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1a3513881bc..eac97ef81e2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
@@ -656,7 +656,7 @@ xfs_buf_item_committing( | |||
656 | /* | 656 | /* |
657 | * This is the ops vector shared by all buf log items. | 657 | * This is the ops vector shared by all buf log items. |
658 | */ | 658 | */ |
659 | static struct xfs_item_ops xfs_buf_item_ops = { | 659 | static const struct xfs_item_ops xfs_buf_item_ops = { |
660 | .iop_size = xfs_buf_item_size, | 660 | .iop_size = xfs_buf_item_size, |
661 | .iop_format = xfs_buf_item_format, | 661 | .iop_format = xfs_buf_item_format, |
662 | .iop_pin = xfs_buf_item_pin, | 662 | .iop_pin = xfs_buf_item_pin, |
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index bb3f71d236d..0dee0b71029 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c | |||
@@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing( | |||
295 | /* | 295 | /* |
296 | * This is the ops vector for dquots | 296 | * This is the ops vector for dquots |
297 | */ | 297 | */ |
298 | static struct xfs_item_ops xfs_dquot_item_ops = { | 298 | static const struct xfs_item_ops xfs_dquot_item_ops = { |
299 | .iop_size = xfs_qm_dquot_logitem_size, | 299 | .iop_size = xfs_qm_dquot_logitem_size, |
300 | .iop_format = xfs_qm_dquot_logitem_format, | 300 | .iop_format = xfs_qm_dquot_logitem_format, |
301 | .iop_pin = xfs_qm_dquot_logitem_pin, | 301 | .iop_pin = xfs_qm_dquot_logitem_pin, |
@@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing( | |||
483 | { | 483 | { |
484 | } | 484 | } |
485 | 485 | ||
486 | static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { | 486 | static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { |
487 | .iop_size = xfs_qm_qoff_logitem_size, | 487 | .iop_size = xfs_qm_qoff_logitem_size, |
488 | .iop_format = xfs_qm_qoff_logitem_format, | 488 | .iop_format = xfs_qm_qoff_logitem_format, |
489 | .iop_pin = xfs_qm_qoff_logitem_pin, | 489 | .iop_pin = xfs_qm_qoff_logitem_pin, |
@@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { | |||
498 | /* | 498 | /* |
499 | * This is the ops vector shared by all quotaoff-start log items. | 499 | * This is the ops vector shared by all quotaoff-start log items. |
500 | */ | 500 | */ |
501 | static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { | 501 | static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { |
502 | .iop_size = xfs_qm_qoff_logitem_size, | 502 | .iop_size = xfs_qm_qoff_logitem_size, |
503 | .iop_format = xfs_qm_qoff_logitem_format, | 503 | .iop_format = xfs_qm_qoff_logitem_format, |
504 | .iop_pin = xfs_qm_qoff_logitem_pin, | 504 | .iop_pin = xfs_qm_qoff_logitem_pin, |
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index da108977b21..558910f5e3c 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c | |||
@@ -98,22 +98,22 @@ xfs_fs_encode_fh( | |||
98 | switch (fileid_type) { | 98 | switch (fileid_type) { |
99 | case FILEID_INO32_GEN_PARENT: | 99 | case FILEID_INO32_GEN_PARENT: |
100 | spin_lock(&dentry->d_lock); | 100 | spin_lock(&dentry->d_lock); |
101 | fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino; | 101 | fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; |
102 | fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; | 102 | fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation; |
103 | spin_unlock(&dentry->d_lock); | 103 | spin_unlock(&dentry->d_lock); |
104 | /*FALLTHRU*/ | 104 | /*FALLTHRU*/ |
105 | case FILEID_INO32_GEN: | 105 | case FILEID_INO32_GEN: |
106 | fid->i32.ino = inode->i_ino; | 106 | fid->i32.ino = XFS_I(inode)->i_ino; |
107 | fid->i32.gen = inode->i_generation; | 107 | fid->i32.gen = inode->i_generation; |
108 | break; | 108 | break; |
109 | case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: | 109 | case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: |
110 | spin_lock(&dentry->d_lock); | 110 | spin_lock(&dentry->d_lock); |
111 | fid64->parent_ino = dentry->d_parent->d_inode->i_ino; | 111 | fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino; |
112 | fid64->parent_gen = dentry->d_parent->d_inode->i_generation; | 112 | fid64->parent_gen = dentry->d_parent->d_inode->i_generation; |
113 | spin_unlock(&dentry->d_lock); | 113 | spin_unlock(&dentry->d_lock); |
114 | /*FALLTHRU*/ | 114 | /*FALLTHRU*/ |
115 | case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: | 115 | case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: |
116 | fid64->ino = inode->i_ino; | 116 | fid64->ino = XFS_I(inode)->i_ino; |
117 | fid64->gen = inode->i_generation; | 117 | fid64->gen = inode->i_generation; |
118 | break; | 118 | break; |
119 | } | 119 | } |
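These hunks switch the file handle encoding from the VFS inode's i_ino, an unsigned long that is only 32 bits wide on 32-bit kernels, to the XFS inode's native 64-bit inode number. A small sketch of the truncation this avoids (the types are userspace stand-ins for unsigned long and xfs_ino_t):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t xfs_ino = 0x100000002ULL;    /* a valid 64-bit inode number */
	uint32_t vfs_ino = (uint32_t)xfs_ino; /* what i_ino holds on ILP32 */

	printf("native:    %llu\n", (unsigned long long)xfs_ino);
	printf("truncated: %lu\n", (unsigned long)vfs_ino); /* 2: wrong file */

	/* encoding the handle from the native field keeps all 64 bits */
	uint64_t fid_ino = xfs_ino;
	return fid_ino == xfs_ino ? 0 : 1;
}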
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d22e6262343..35c2aff38b2 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c | |||
@@ -217,7 +217,7 @@ xfs_efi_item_committing( | |||
217 | /* | 217 | /* |
218 | * This is the ops vector shared by all efi log items. | 218 | * This is the ops vector shared by all efi log items. |
219 | */ | 219 | */ |
220 | static struct xfs_item_ops xfs_efi_item_ops = { | 220 | static const struct xfs_item_ops xfs_efi_item_ops = { |
221 | .iop_size = xfs_efi_item_size, | 221 | .iop_size = xfs_efi_item_size, |
222 | .iop_format = xfs_efi_item_format, | 222 | .iop_format = xfs_efi_item_format, |
223 | .iop_pin = xfs_efi_item_pin, | 223 | .iop_pin = xfs_efi_item_pin, |
@@ -477,7 +477,7 @@ xfs_efd_item_committing( | |||
477 | /* | 477 | /* |
478 | * This is the ops vector shared by all efd log items. | 478 | * This is the ops vector shared by all efd log items. |
479 | */ | 479 | */ |
480 | static struct xfs_item_ops xfs_efd_item_ops = { | 480 | static const struct xfs_item_ops xfs_efd_item_ops = { |
481 | .iop_size = xfs_efd_item_size, | 481 | .iop_size = xfs_efd_item_size, |
482 | .iop_format = xfs_efd_item_format, | 482 | .iop_format = xfs_efd_item_format, |
483 | .iop_pin = xfs_efd_item_pin, | 483 | .iop_pin = xfs_efd_item_pin, |
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0237c602f1..755ee816488 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -2835,6 +2835,27 @@ corrupt_out: | |||
2835 | return XFS_ERROR(EFSCORRUPTED); | 2835 | return XFS_ERROR(EFSCORRUPTED); |
2836 | } | 2836 | } |
2837 | 2837 | ||
2838 | void | ||
2839 | xfs_promote_inode( | ||
2840 | struct xfs_inode *ip) | ||
2841 | { | ||
2842 | struct xfs_buf *bp; | ||
2843 | |||
2844 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); | ||
2845 | |||
2846 | bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno, | ||
2847 | ip->i_imap.im_len, XBF_TRYLOCK); | ||
2848 | if (!bp) | ||
2849 | return; | ||
2850 | |||
2851 | if (XFS_BUF_ISDELAYWRITE(bp)) { | ||
2852 | xfs_buf_delwri_promote(bp); | ||
2853 | wake_up_process(ip->i_mount->m_ddev_targp->bt_task); | ||
2854 | } | ||
2855 | |||
2856 | xfs_buf_relse(bp); | ||
2857 | } | ||
2858 | |||
2838 | /* | 2859 | /* |
2839 | * Return a pointer to the extent record at file index idx. | 2860 | * Return a pointer to the extent record at file index idx. |
2840 | */ | 2861 | */ |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 760140d1dd6..b4cd4739f98 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -498,6 +498,7 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); | |||
498 | void xfs_iext_realloc(xfs_inode_t *, int, int); | 498 | void xfs_iext_realloc(xfs_inode_t *, int, int); |
499 | void xfs_iunpin_wait(xfs_inode_t *); | 499 | void xfs_iunpin_wait(xfs_inode_t *); |
500 | int xfs_iflush(xfs_inode_t *, uint); | 500 | int xfs_iflush(xfs_inode_t *, uint); |
501 | void xfs_promote_inode(struct xfs_inode *); | ||
501 | void xfs_lock_inodes(xfs_inode_t **, int, uint); | 502 | void xfs_lock_inodes(xfs_inode_t **, int, uint); |
502 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); | 503 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); |
503 | 504 | ||
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b7cf21ba240..abaafdbb3e6 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
@@ -795,7 +795,7 @@ xfs_inode_item_committing( | |||
795 | /* | 795 | /* |
796 | * This is the ops vector shared by all inode log items. | 796 | * This is the ops vector shared by all inode log items. |
797 | */ | 797 | */ |
798 | static struct xfs_item_ops xfs_inode_item_ops = { | 798 | static const struct xfs_item_ops xfs_inode_item_ops = { |
799 | .iop_size = xfs_inode_item_size, | 799 | .iop_size = xfs_inode_item_size, |
800 | .iop_format = xfs_inode_item_format, | 800 | .iop_format = xfs_inode_item_format, |
801 | .iop_pin = xfs_inode_item_pin, | 801 | .iop_pin = xfs_inode_item_pin, |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2758a6277c5..34817adf4b9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
@@ -150,6 +150,117 @@ xlog_grant_add_space( | |||
150 | } while (head_val != old); | 150 | } while (head_val != old); |
151 | } | 151 | } |
152 | 152 | ||
153 | STATIC bool | ||
154 | xlog_reserveq_wake( | ||
155 | struct log *log, | ||
156 | int *free_bytes) | ||
157 | { | ||
158 | struct xlog_ticket *tic; | ||
159 | int need_bytes; | ||
160 | |||
161 | list_for_each_entry(tic, &log->l_reserveq, t_queue) { | ||
162 | if (tic->t_flags & XLOG_TIC_PERM_RESERV) | ||
163 | need_bytes = tic->t_unit_res * tic->t_cnt; | ||
164 | else | ||
165 | need_bytes = tic->t_unit_res; | ||
166 | |||
167 | if (*free_bytes < need_bytes) | ||
168 | return false; | ||
169 | *free_bytes -= need_bytes; | ||
170 | |||
171 | trace_xfs_log_grant_wake_up(log, tic); | ||
172 | wake_up(&tic->t_wait); | ||
173 | } | ||
174 | |||
175 | return true; | ||
176 | } | ||
177 | |||
178 | STATIC bool | ||
179 | xlog_writeq_wake( | ||
180 | struct log *log, | ||
181 | int *free_bytes) | ||
182 | { | ||
183 | struct xlog_ticket *tic; | ||
184 | int need_bytes; | ||
185 | |||
186 | list_for_each_entry(tic, &log->l_writeq, t_queue) { | ||
187 | ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); | ||
188 | |||
189 | need_bytes = tic->t_unit_res; | ||
190 | |||
191 | if (*free_bytes < need_bytes) | ||
192 | return false; | ||
193 | *free_bytes -= need_bytes; | ||
194 | |||
195 | trace_xfs_log_regrant_write_wake_up(log, tic); | ||
196 | wake_up(&tic->t_wait); | ||
197 | } | ||
198 | |||
199 | return true; | ||
200 | } | ||
201 | |||
202 | STATIC int | ||
203 | xlog_reserveq_wait( | ||
204 | struct log *log, | ||
205 | struct xlog_ticket *tic, | ||
206 | int need_bytes) | ||
207 | { | ||
208 | list_add_tail(&tic->t_queue, &log->l_reserveq); | ||
209 | |||
210 | do { | ||
211 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
212 | goto shutdown; | ||
213 | xlog_grant_push_ail(log, need_bytes); | ||
214 | |||
215 | XFS_STATS_INC(xs_sleep_logspace); | ||
216 | trace_xfs_log_grant_sleep(log, tic); | ||
217 | |||
218 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
219 | trace_xfs_log_grant_wake(log, tic); | ||
220 | |||
221 | spin_lock(&log->l_grant_reserve_lock); | ||
222 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
223 | goto shutdown; | ||
224 | } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes); | ||
225 | |||
226 | list_del_init(&tic->t_queue); | ||
227 | return 0; | ||
228 | shutdown: | ||
229 | list_del_init(&tic->t_queue); | ||
230 | return XFS_ERROR(EIO); | ||
231 | } | ||
232 | |||
233 | STATIC int | ||
234 | xlog_writeq_wait( | ||
235 | struct log *log, | ||
236 | struct xlog_ticket *tic, | ||
237 | int need_bytes) | ||
238 | { | ||
239 | list_add_tail(&tic->t_queue, &log->l_writeq); | ||
240 | |||
241 | do { | ||
242 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
243 | goto shutdown; | ||
244 | xlog_grant_push_ail(log, need_bytes); | ||
245 | |||
246 | XFS_STATS_INC(xs_sleep_logspace); | ||
247 | trace_xfs_log_regrant_write_sleep(log, tic); | ||
248 | |||
249 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
250 | trace_xfs_log_regrant_write_wake(log, tic); | ||
251 | |||
252 | spin_lock(&log->l_grant_write_lock); | ||
253 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
254 | goto shutdown; | ||
255 | } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes); | ||
256 | |||
257 | list_del_init(&tic->t_queue); | ||
258 | return 0; | ||
259 | shutdown: | ||
260 | list_del_init(&tic->t_queue); | ||
261 | return XFS_ERROR(EIO); | ||
262 | } | ||
263 | |||
153 | static void | 264 | static void |
154 | xlog_tic_reset_res(xlog_ticket_t *tic) | 265 | xlog_tic_reset_res(xlog_ticket_t *tic) |
155 | { | 266 | { |
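The two wake helpers above walk their queue strictly in FIFO order, charging each waiter's reservation against the remaining free space and stopping at the first one that does not fit; returning false tells the caller it must queue behind that waiter rather than jump ahead of it. A condensed userspace model of that policy (a plain array stands in for the ticket list, and setting a flag stands in for wake_up()):

#include <stdbool.h>
#include <stdio.h>

struct ticket {
	int  need_bytes;
	bool woken;
};

static bool reserveq_wake(struct ticket *q, int n, int *free_bytes)
{
	for (int i = 0; i < n; i++) {
		if (*free_bytes < q[i].need_bytes)
			return false;	/* head of the queue is still blocked */
		*free_bytes -= q[i].need_bytes;
		q[i].woken = true;	/* models wake_up(&tic->t_wait) */
	}
	return true;			/* queue drained; caller may try */
}

int main(void)
{
	struct ticket q[3] = { { 100, false }, { 300, false }, { 50, false } };
	int free_bytes = 350;

	bool drained = reserveq_wake(q, 3, &free_bytes);

	/* prints: drained=0 free=250 woken=1,0,0 -- the 50-byte ticket is
	 * not woken past the blocked 300-byte one, preserving FIFO order */
	printf("drained=%d free=%d woken=%d,%d,%d\n", drained, free_bytes,
	       q[0].woken, q[1].woken, q[2].woken);
	return 0;
}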
@@ -350,8 +461,19 @@ xfs_log_reserve( | |||
350 | retval = xlog_grant_log_space(log, internal_ticket); | 461 | retval = xlog_grant_log_space(log, internal_ticket); |
351 | } | 462 | } |
352 | 463 | ||
464 | if (unlikely(retval)) { | ||
465 | /* | ||
466 | * If we are failing, make sure the ticket doesn't have any | ||
467 | * current reservations. We don't want to add this back | ||
468 | * when the ticket/transaction gets cancelled. |||
469 | */ | ||
470 | internal_ticket->t_curr_res = 0; | ||
471 | /* ungrant will give back unit_res * t_cnt. */ | ||
472 | internal_ticket->t_cnt = 0; | ||
473 | } | ||
474 | |||
353 | return retval; | 475 | return retval; |
354 | } /* xfs_log_reserve */ | 476 | } |
355 | 477 | ||
356 | 478 | ||
357 | /* | 479 | /* |
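This hunk hoists the old per-function error handling into xfs_log_reserve(): a ticket whose reservation failed is neutralised so that the later cancel path does not return space that was never granted. A loose model of why the zeroing matters (the ungrant arithmetic is simplified from the real xlog_ungrant_log_space):

#include <stdio.h>

struct ticket { int t_curr_res, t_unit_res, t_cnt; };

static int grant_head;			/* bytes currently granted */

/* cancelling gives back roughly unit_res * t_cnt plus any current
 * reservation, whether or not the ticket ever got that space */
static void ticket_ungrant(struct ticket *t)
{
	grant_head -= t->t_unit_res * t->t_cnt + t->t_curr_res;
}

int main(void)
{
	struct ticket t = { .t_curr_res = 512, .t_unit_res = 512, .t_cnt = 2 };

	/* the reservation failed: neutralise before the cancel path runs */
	t.t_curr_res = 0;
	t.t_cnt = 0;

	ticket_ungrant(&t);
	printf("grant_head=%d\n", grant_head);	/* 0: no space wrongly freed */
	return 0;
}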
@@ -626,7 +748,7 @@ xfs_log_item_init( | |||
626 | struct xfs_mount *mp, | 748 | struct xfs_mount *mp, |
627 | struct xfs_log_item *item, | 749 | struct xfs_log_item *item, |
628 | int type, | 750 | int type, |
629 | struct xfs_item_ops *ops) | 751 | const struct xfs_item_ops *ops) |
630 | { | 752 | { |
631 | item->li_mountp = mp; | 753 | item->li_mountp = mp; |
632 | item->li_ailp = mp->m_ail; | 754 | item->li_ailp = mp->m_ail; |
@@ -2481,8 +2603,8 @@ restart: | |||
2481 | /* | 2603 | /* |
2482 | * Atomically get the log space required for a log ticket. | 2604 | * Atomically get the log space required for a log ticket. |
2483 | * | 2605 | * |
2484 | * Once a ticket gets put onto the reserveq, it will only return after | 2606 | * Once a ticket gets put onto the reserveq, it will only return after the |
2485 | * the needed reservation is satisfied. | 2607 | * needed reservation is satisfied. |
2486 | * | 2608 | * |
2487 | * This function is structured so that it has a lock free fast path. This is | 2609 | * This function is structured so that it has a lock free fast path. This is |
2488 | * necessary because every new transaction reservation will come through this | 2610 | * necessary because every new transaction reservation will come through this |
@@ -2490,113 +2612,53 @@ restart: | |||
2490 | * every pass. | 2612 | * every pass. |
2491 | * | 2613 | * |
2492 | * As tickets are only ever moved on and off the reserveq under the | 2614 | * As tickets are only ever moved on and off the reserveq under the |
2493 | * l_grant_reserve_lock, we only need to take that lock if we are going | 2615 | * l_grant_reserve_lock, we only need to take that lock if we are going to add |
2494 | * to add the ticket to the queue and sleep. We can avoid taking the lock if the | 2616 | * the ticket to the queue and sleep. We can avoid taking the lock if the ticket |
2495 | * ticket was never added to the reserveq because the t_queue list head will be | 2617 | * was never added to the reserveq because the t_queue list head will be empty |
2496 | * empty and we hold the only reference to it so it can safely be checked | 2618 | * and we hold the only reference to it so it can safely be checked unlocked. |
2497 | * unlocked. | ||
2498 | */ | 2619 | */ |
2499 | STATIC int | 2620 | STATIC int |
2500 | xlog_grant_log_space(xlog_t *log, | 2621 | xlog_grant_log_space( |
2501 | xlog_ticket_t *tic) | 2622 | struct log *log, |
2623 | struct xlog_ticket *tic) | ||
2502 | { | 2624 | { |
2503 | int free_bytes; | 2625 | int free_bytes, need_bytes; |
2504 | int need_bytes; | 2626 | int error = 0; |
2505 | 2627 | ||
2506 | #ifdef DEBUG | 2628 | ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); |
2507 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | ||
2508 | panic("grant Recovery problem"); | ||
2509 | #endif | ||
2510 | 2629 | ||
2511 | trace_xfs_log_grant_enter(log, tic); | 2630 | trace_xfs_log_grant_enter(log, tic); |
2512 | 2631 | ||
2632 | /* | ||
2633 | * If there are other waiters on the queue then give them a chance at | ||
2634 | * logspace before us. Wake up the first waiters; if we do not wake |||
2635 | * up all the waiters, go to sleep waiting for more free space; |||
2636 | * otherwise try to get some space for this transaction. |||
2637 | */ | ||
2513 | need_bytes = tic->t_unit_res; | 2638 | need_bytes = tic->t_unit_res; |
2514 | if (tic->t_flags & XFS_LOG_PERM_RESERV) | 2639 | if (tic->t_flags & XFS_LOG_PERM_RESERV) |
2515 | need_bytes *= tic->t_ocnt; | 2640 | need_bytes *= tic->t_ocnt; |
2516 | |||
2517 | /* something is already sleeping; insert new transaction at end */ | ||
2518 | if (!list_empty_careful(&log->l_reserveq)) { | ||
2519 | spin_lock(&log->l_grant_reserve_lock); | ||
2520 | /* recheck the queue now we are locked */ | ||
2521 | if (list_empty(&log->l_reserveq)) { | ||
2522 | spin_unlock(&log->l_grant_reserve_lock); | ||
2523 | goto redo; | ||
2524 | } | ||
2525 | list_add_tail(&tic->t_queue, &log->l_reserveq); | ||
2526 | |||
2527 | trace_xfs_log_grant_sleep1(log, tic); | ||
2528 | |||
2529 | /* | ||
2530 | * Gotta check this before going to sleep, while we're | ||
2531 | * holding the grant lock. | ||
2532 | */ | ||
2533 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2534 | goto error_return; | ||
2535 | |||
2536 | XFS_STATS_INC(xs_sleep_logspace); | ||
2537 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
2538 | |||
2539 | /* | ||
2540 | * If we got an error, and the filesystem is shutting down, | ||
2541 | * we'll catch it down below. So just continue... | ||
2542 | */ | ||
2543 | trace_xfs_log_grant_wake1(log, tic); | ||
2544 | } | ||
2545 | |||
2546 | redo: | ||
2547 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2548 | goto error_return_unlocked; | ||
2549 | |||
2550 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); | 2641 | free_bytes = xlog_space_left(log, &log->l_grant_reserve_head); |
2551 | if (free_bytes < need_bytes) { | 2642 | if (!list_empty_careful(&log->l_reserveq)) { |
2552 | spin_lock(&log->l_grant_reserve_lock); | 2643 | spin_lock(&log->l_grant_reserve_lock); |
2553 | if (list_empty(&tic->t_queue)) | 2644 | if (!xlog_reserveq_wake(log, &free_bytes) || |
2554 | list_add_tail(&tic->t_queue, &log->l_reserveq); | 2645 | free_bytes < need_bytes) |
2555 | 2646 | error = xlog_reserveq_wait(log, tic, need_bytes); | |
2556 | trace_xfs_log_grant_sleep2(log, tic); | 2647 | spin_unlock(&log->l_grant_reserve_lock); |
2557 | 2648 | } else if (free_bytes < need_bytes) { | |
2558 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2559 | goto error_return; | ||
2560 | |||
2561 | xlog_grant_push_ail(log, need_bytes); | ||
2562 | |||
2563 | XFS_STATS_INC(xs_sleep_logspace); | ||
2564 | xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock); | ||
2565 | |||
2566 | trace_xfs_log_grant_wake2(log, tic); | ||
2567 | goto redo; | ||
2568 | } | ||
2569 | |||
2570 | if (!list_empty(&tic->t_queue)) { | ||
2571 | spin_lock(&log->l_grant_reserve_lock); | 2649 | spin_lock(&log->l_grant_reserve_lock); |
2572 | list_del_init(&tic->t_queue); | 2650 | error = xlog_reserveq_wait(log, tic, need_bytes); |
2573 | spin_unlock(&log->l_grant_reserve_lock); | 2651 | spin_unlock(&log->l_grant_reserve_lock); |
2574 | } | 2652 | } |
2653 | if (error) | ||
2654 | return error; | ||
2575 | 2655 | ||
2576 | /* we've got enough space */ | ||
2577 | xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); | 2656 | xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes); |
2578 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); | 2657 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); |
2579 | trace_xfs_log_grant_exit(log, tic); | 2658 | trace_xfs_log_grant_exit(log, tic); |
2580 | xlog_verify_grant_tail(log); | 2659 | xlog_verify_grant_tail(log); |
2581 | return 0; | 2660 | return 0; |
2582 | 2661 | } | |
2583 | error_return_unlocked: | ||
2584 | spin_lock(&log->l_grant_reserve_lock); | ||
2585 | error_return: | ||
2586 | list_del_init(&tic->t_queue); | ||
2587 | spin_unlock(&log->l_grant_reserve_lock); | ||
2588 | trace_xfs_log_grant_error(log, tic); | ||
2589 | |||
2590 | /* | ||
2591 | * If we are failing, make sure the ticket doesn't have any | ||
2592 | * current reservations. We don't want to add this back when | ||
2593 | * the ticket/transaction gets cancelled. | ||
2594 | */ | ||
2595 | tic->t_curr_res = 0; | ||
2596 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ | ||
2597 | return XFS_ERROR(EIO); | ||
2598 | } /* xlog_grant_log_space */ | ||
2599 | |||
2600 | 2662 | ||
2601 | /* | 2663 | /* |
2602 | * Replenish the byte reservation required by moving the grant write head. | 2664 | * Replenish the byte reservation required by moving the grant write head. |
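A userspace approximation of the lock-free fast path the comment describes, assuming C11 atomics in place of the kernel's atomic64 compare-exchange loop in xlog_grant_add_space(); the unlocked read of the queue length stands in for list_empty_careful():

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

#define LOG_SIZE 1024

static _Atomic int grant_head;		/* bytes handed out so far */
static pthread_mutex_t reserve_lock = PTHREAD_MUTEX_INITIALIZER;
static int reserveq_len;		/* waiters queued ahead of us */

static int grant_space(int need_bytes)
{
	int old = atomic_load(&grant_head);

	/* fast path: no waiters and enough space -> no lock taken */
	while (reserveq_len == 0 && LOG_SIZE - old >= need_bytes) {
		if (atomic_compare_exchange_weak(&grant_head, &old,
						 old + need_bytes))
			return 0;
		/* lost a race: 'old' was reloaded, re-test and retry */
	}

	/* slow path: only now take the lock, then queue and sleep */
	pthread_mutex_lock(&reserve_lock);
	reserveq_len++;		/* list_add_tail(&tic->t_queue, ...) */
	/* the real code calls xlog_wait() here and retries when woken;
	 * this sketch just reports that it would have slept */
	reserveq_len--;
	pthread_mutex_unlock(&reserve_lock);
	return -1;
}

int main(void)
{
	int a = grant_space(100);
	int b = grant_space(200);
	int c = grant_space(10000);

	printf("%d %d %d\n", a, b, c);	/* 0 0 -1 */
	return 0;
}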
@@ -2605,10 +2667,12 @@ error_return: | |||
2605 | * free fast path. | 2667 | * free fast path. |
2606 | */ | 2668 | */ |
2607 | STATIC int | 2669 | STATIC int |
2608 | xlog_regrant_write_log_space(xlog_t *log, | 2670 | xlog_regrant_write_log_space( |
2609 | xlog_ticket_t *tic) | 2671 | struct log *log, |
2672 | struct xlog_ticket *tic) | ||
2610 | { | 2673 | { |
2611 | int free_bytes, need_bytes; | 2674 | int free_bytes, need_bytes; |
2675 | int error = 0; | ||
2612 | 2676 | ||
2613 | tic->t_curr_res = tic->t_unit_res; | 2677 | tic->t_curr_res = tic->t_unit_res; |
2614 | xlog_tic_reset_res(tic); | 2678 | xlog_tic_reset_res(tic); |
@@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t *log, | |||
2616 | if (tic->t_cnt > 0) | 2680 | if (tic->t_cnt > 0) |
2617 | return 0; | 2681 | return 0; |
2618 | 2682 | ||
2619 | #ifdef DEBUG | 2683 | ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); |
2620 | if (log->l_flags & XLOG_ACTIVE_RECOVERY) | ||
2621 | panic("regrant Recovery problem"); | ||
2622 | #endif | ||
2623 | 2684 | ||
2624 | trace_xfs_log_regrant_write_enter(log, tic); | 2685 | trace_xfs_log_regrant_write_enter(log, tic); |
2625 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2626 | goto error_return_unlocked; | ||
2627 | 2686 | ||
2628 | /* If there are other waiters on the queue then give them a | 2687 | /* |
2629 | * chance at logspace before us. Wake up the first waiters, | 2688 | * If there are other waiters on the queue then give them a chance at |
2630 | * if we do not wake up all the waiters then go to sleep waiting | 2689 | * logspace before us. Wake up the first waiters; if we do not wake |
2631 | * for more free space, otherwise try to get some space for | 2690 | * up all the waiters, go to sleep waiting for more free space; |
2632 | * this transaction. | 2691 | * otherwise try to get some space for this transaction. |
2633 | */ | 2692 | */ |
2634 | need_bytes = tic->t_unit_res; | 2693 | need_bytes = tic->t_unit_res; |
2635 | if (!list_empty_careful(&log->l_writeq)) { | ||
2636 | struct xlog_ticket *ntic; | ||
2637 | |||
2638 | spin_lock(&log->l_grant_write_lock); | ||
2639 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); | ||
2640 | list_for_each_entry(ntic, &log->l_writeq, t_queue) { | ||
2641 | ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); | ||
2642 | |||
2643 | if (free_bytes < ntic->t_unit_res) | ||
2644 | break; | ||
2645 | free_bytes -= ntic->t_unit_res; | ||
2646 | wake_up(&ntic->t_wait); | ||
2647 | } | ||
2648 | |||
2649 | if (ntic != list_first_entry(&log->l_writeq, | ||
2650 | struct xlog_ticket, t_queue)) { | ||
2651 | if (list_empty(&tic->t_queue)) | ||
2652 | list_add_tail(&tic->t_queue, &log->l_writeq); | ||
2653 | trace_xfs_log_regrant_write_sleep1(log, tic); | ||
2654 | |||
2655 | xlog_grant_push_ail(log, need_bytes); | ||
2656 | |||
2657 | XFS_STATS_INC(xs_sleep_logspace); | ||
2658 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
2659 | trace_xfs_log_regrant_write_wake1(log, tic); | ||
2660 | } else | ||
2661 | spin_unlock(&log->l_grant_write_lock); | ||
2662 | } | ||
2663 | |||
2664 | redo: | ||
2665 | if (XLOG_FORCED_SHUTDOWN(log)) | ||
2666 | goto error_return_unlocked; | ||
2667 | |||
2668 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); | 2694 | free_bytes = xlog_space_left(log, &log->l_grant_write_head); |
2669 | if (free_bytes < need_bytes) { | 2695 | if (!list_empty_careful(&log->l_writeq)) { |
2670 | spin_lock(&log->l_grant_write_lock); | 2696 | spin_lock(&log->l_grant_write_lock); |
2671 | if (list_empty(&tic->t_queue)) | 2697 | if (!xlog_writeq_wake(log, &free_bytes) || |
2672 | list_add_tail(&tic->t_queue, &log->l_writeq); | 2698 | free_bytes < need_bytes) |
2673 | 2699 | error = xlog_writeq_wait(log, tic, need_bytes); | |
2674 | if (XLOG_FORCED_SHUTDOWN(log)) | 2700 | spin_unlock(&log->l_grant_write_lock); |
2675 | goto error_return; | 2701 | } else if (free_bytes < need_bytes) { |
2676 | |||
2677 | xlog_grant_push_ail(log, need_bytes); | ||
2678 | |||
2679 | XFS_STATS_INC(xs_sleep_logspace); | ||
2680 | trace_xfs_log_regrant_write_sleep2(log, tic); | ||
2681 | xlog_wait(&tic->t_wait, &log->l_grant_write_lock); | ||
2682 | |||
2683 | trace_xfs_log_regrant_write_wake2(log, tic); | ||
2684 | goto redo; | ||
2685 | } | ||
2686 | |||
2687 | if (!list_empty(&tic->t_queue)) { | ||
2688 | spin_lock(&log->l_grant_write_lock); | 2702 | spin_lock(&log->l_grant_write_lock); |
2689 | list_del_init(&tic->t_queue); | 2703 | error = xlog_writeq_wait(log, tic, need_bytes); |
2690 | spin_unlock(&log->l_grant_write_lock); | 2704 | spin_unlock(&log->l_grant_write_lock); |
2691 | } | 2705 | } |
2692 | 2706 | ||
2693 | /* we've got enough space */ | 2707 | if (error) |
2708 | return error; | ||
2709 | |||
2694 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); | 2710 | xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes); |
2695 | trace_xfs_log_regrant_write_exit(log, tic); | 2711 | trace_xfs_log_regrant_write_exit(log, tic); |
2696 | xlog_verify_grant_tail(log); | 2712 | xlog_verify_grant_tail(log); |
2697 | return 0; | 2713 | return 0; |
2698 | 2714 | } | |
2699 | |||
2700 | error_return_unlocked: | ||
2701 | spin_lock(&log->l_grant_write_lock); | ||
2702 | error_return: | ||
2703 | list_del_init(&tic->t_queue); | ||
2704 | spin_unlock(&log->l_grant_write_lock); | ||
2705 | trace_xfs_log_regrant_write_error(log, tic); | ||
2706 | |||
2707 | /* | ||
2708 | * If we are failing, make sure the ticket doesn't have any | ||
2709 | * current reservations. We don't want to add this back when | ||
2710 | * the ticket/transaction gets cancelled. | ||
2711 | */ | ||
2712 | tic->t_curr_res = 0; | ||
2713 | tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ | ||
2714 | return XFS_ERROR(EIO); | ||
2715 | } /* xlog_regrant_write_log_space */ | ||
2716 | |||
2717 | 2715 | ||
2718 | /* The first cnt-1 times through here we don't need to | 2716 | /* The first cnt-1 times through here we don't need to |
2719 | * move the grant write head because the permanent | 2717 | * move the grant write head because the permanent |
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 78c9039994a..3f7bf451c03 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h | |||
@@ -137,7 +137,7 @@ struct xfs_trans; | |||
137 | void xfs_log_item_init(struct xfs_mount *mp, | 137 | void xfs_log_item_init(struct xfs_mount *mp, |
138 | struct xfs_log_item *item, | 138 | struct xfs_log_item *item, |
139 | int type, | 139 | int type, |
140 | struct xfs_item_ops *ops); | 140 | const struct xfs_item_ops *ops); |
141 | 141 | ||
142 | xfs_lsn_t xfs_log_done(struct xfs_mount *mp, | 142 | xfs_lsn_t xfs_log_done(struct xfs_mount *mp, |
143 | struct xlog_ticket *ticket, | 143 | struct xlog_ticket *ticket, |
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5cff443f6cd..0bbb1a41998 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c | |||
@@ -674,7 +674,8 @@ xfs_qm_dqattach_one( | |||
674 | * disk and we didn't ask it to allocate; | 674 | * disk and we didn't ask it to allocate; |
675 | * ESRCH if quotas got turned off suddenly. | 675 | * ESRCH if quotas got turned off suddenly. |
676 | */ | 676 | */ |
677 | error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); | 677 | error = xfs_qm_dqget(ip->i_mount, ip, id, type, |
678 | doalloc | XFS_QMOPT_DOWARN, &dqp); | ||
678 | if (error) | 679 | if (error) |
679 | return error; | 680 | return error; |
680 | 681 | ||
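The one-line fix above forwards the caller's doalloc flag instead of silently dropping it, so a dquot that is not yet on disk can actually be allocated during attach. A toy illustration of the flag-forwarding bug (flag names and values here are made up):

#include <stdio.h>

#define QMOPT_DQALLOC	0x1
#define QMOPT_DOWARN	0x2

static int dqget(unsigned int flags)
{
	if (!(flags & QMOPT_DQALLOC))
		return -2;	/* ENOENT: not on disk, not asked to allocate */
	return 0;		/* allocated */
}

static int dqattach_one(unsigned int doalloc)
{
	/* the fix: OR the caller's flag in rather than passing DOWARN alone */
	return dqget(doalloc | QMOPT_DOWARN);
}

int main(void)
{
	printf("with alloc: %d\n", dqattach_one(QMOPT_DQALLOC));	/*  0 */
	printf("without:    %d\n", dqattach_one(0));			/* -2 */
	return 0;
}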
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index aa3dc1a4d53..be5c51d8f75 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c | |||
@@ -770,6 +770,17 @@ restart: | |||
770 | if (!xfs_iflock_nowait(ip)) { | 770 | if (!xfs_iflock_nowait(ip)) { |
771 | if (!(sync_mode & SYNC_WAIT)) | 771 | if (!(sync_mode & SYNC_WAIT)) |
772 | goto out; | 772 | goto out; |
773 | |||
774 | /* | ||
775 | * If we only have a single dirty inode in a cluster there is | ||
776 | * a fair chance that the AIL push may have pushed it into | ||
777 | * the buffer, but xfsbufd won't touch it until 30 seconds | ||
778 | * from now, and thus we will lock up here. | ||
779 | * | ||
780 | * Promote the inode buffer to the front of the delwri list | ||
781 | * and wake up xfsbufd now. | ||
782 | */ | ||
783 | xfs_promote_inode(ip); | ||
773 | xfs_iflock(ip); | 784 | xfs_iflock(ip); |
774 | } | 785 | } |
775 | 786 | ||
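The new comment spells out the hang being fixed: with a single dirty inode in a cluster, the AIL push may leave the inode buffer on the delayed-write queue where xfsbufd will not touch it for up to 30 seconds, while this thread blocks in xfs_iflock(). xfs_promote_inode(), added in the xfs_inode.c hunk above, breaks the wait by moving the buffer to the front of the queue and waking the flusher. A userspace model of that promote-and-kick step, using a singly linked list and a condition variable as stand-ins:

#include <pthread.h>
#include <stdio.h>

struct buf {
	struct buf *next;
	const char *name;
};

static struct buf *delwri_head;	/* delayed-write queue, flushed front first */
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  kick  = PTHREAD_COND_INITIALIZER;

/* move bp to the front of the queue and wake the flusher now, instead
 * of letting bp age out on the flusher's periodic timer */
static void delwri_promote(struct buf *bp)
{
	struct buf **pp;

	pthread_mutex_lock(&qlock);
	for (pp = &delwri_head; *pp && *pp != bp; pp = &(*pp)->next)
		;
	if (*pp) {			/* unlink bp, splice it to the head */
		*pp = bp->next;
		bp->next = delwri_head;
		delwri_head = bp;
	}
	pthread_cond_signal(&kick);	/* wake_up_process(...->bt_task) */
	pthread_mutex_unlock(&qlock);
}

int main(void)
{
	struct buf b1 = { .name = "some other buffer" };
	struct buf b2 = { .name = "inode cluster buffer" };
	struct buf *bp;

	b1.next = &b2;
	delwri_head = &b1;

	delwri_promote(&b2);		/* inode buffer jumps the queue */
	for (bp = delwri_head; bp; bp = bp->next)
		printf("%s\n", bp->name);
	return 0;
}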
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f1d2802b2f0..49403579887 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
@@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); | |||
834 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); | 834 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter); |
835 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); | 835 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit); |
836 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); | 836 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_error); |
837 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); | 837 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); |
838 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); | 838 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); |
839 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); | ||
840 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); | ||
841 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); | 839 | DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); |
842 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); | 840 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); |
843 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); | 841 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); |
844 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); | 842 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); |
845 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); | 843 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep); |
846 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); | 844 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake); |
847 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); | ||
848 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); | ||
849 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); | 845 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up); |
850 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); | 846 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); |
851 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); | 847 | DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); |
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 603f3eb5204..3ae713c0abd 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
@@ -326,7 +326,7 @@ typedef struct xfs_log_item { | |||
326 | struct xfs_log_item *); | 326 | struct xfs_log_item *); |
327 | /* buffer item iodone */ | 327 | /* buffer item iodone */ |
328 | /* callback func */ | 328 | /* callback func */ |
329 | struct xfs_item_ops *li_ops; /* function list */ | 329 | const struct xfs_item_ops *li_ops; /* function list */ |
330 | 330 | ||
331 | /* delayed logging */ | 331 | /* delayed logging */ |
332 | struct list_head li_cil; /* CIL pointers */ | 332 | struct list_head li_cil; /* CIL pointers */ |
@@ -341,7 +341,7 @@ typedef struct xfs_log_item { | |||
341 | { XFS_LI_IN_AIL, "IN_AIL" }, \ | 341 | { XFS_LI_IN_AIL, "IN_AIL" }, \ |
342 | { XFS_LI_ABORTED, "ABORTED" } | 342 | { XFS_LI_ABORTED, "ABORTED" } |
343 | 343 | ||
344 | typedef struct xfs_item_ops { | 344 | struct xfs_item_ops { |
345 | uint (*iop_size)(xfs_log_item_t *); | 345 | uint (*iop_size)(xfs_log_item_t *); |
346 | void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); | 346 | void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); |
347 | void (*iop_pin)(xfs_log_item_t *); | 347 | void (*iop_pin)(xfs_log_item_t *); |
@@ -352,7 +352,7 @@ typedef struct xfs_item_ops { | |||
352 | void (*iop_push)(xfs_log_item_t *); | 352 | void (*iop_push)(xfs_log_item_t *); |
353 | bool (*iop_pushbuf)(xfs_log_item_t *); | 353 | bool (*iop_pushbuf)(xfs_log_item_t *); |
354 | void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); | 354 | void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); |
355 | } xfs_item_ops_t; | 355 | }; |
356 | 356 | ||
357 | #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) | 357 | #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) |
358 | #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) | 358 | #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4ecf2a54906..ce9268a2f56 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -112,7 +112,7 @@ xfs_readlink( | |||
112 | char *link) | 112 | char *link) |
113 | { | 113 | { |
114 | xfs_mount_t *mp = ip->i_mount; | 114 | xfs_mount_t *mp = ip->i_mount; |
115 | int pathlen; | 115 | xfs_fsize_t pathlen; |
116 | int error = 0; | 116 | int error = 0; |
117 | 117 | ||
118 | trace_xfs_readlink(ip); | 118 | trace_xfs_readlink(ip); |
@@ -122,13 +122,19 @@ xfs_readlink( | |||
122 | 122 | ||
123 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 123 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
124 | 124 | ||
125 | ASSERT(S_ISLNK(ip->i_d.di_mode)); | ||
126 | ASSERT(ip->i_d.di_size <= MAXPATHLEN); | ||
127 | |||
128 | pathlen = ip->i_d.di_size; | 125 | pathlen = ip->i_d.di_size; |
129 | if (!pathlen) | 126 | if (!pathlen) |
130 | goto out; | 127 | goto out; |
131 | 128 | ||
129 | if (pathlen < 0 || pathlen > MAXPATHLEN) { | ||
130 | xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", | ||
131 | __func__, (unsigned long long) ip->i_ino, | ||
132 | (long long) pathlen); | ||
133 | ASSERT(0); | ||
134 | error = XFS_ERROR(EFSCORRUPTED); | ||
135 | goto out; | ||
136 | } | ||
137 | |||
132 | if (ip->i_df.if_flags & XFS_IFINLINE) { | 138 | if (ip->i_df.if_flags & XFS_IFINLINE) { |
133 | memcpy(link, ip->i_df.if_u1.if_data, pathlen); | 139 | memcpy(link, ip->i_df.if_u1.if_data, pathlen); |
134 | link[pathlen] = '\0'; | 140 | link[pathlen] = '\0'; |
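The rewritten xfs_readlink() treats the on-disk di_size as untrusted input: the old ASSERTs compile away on production kernels, so a corrupted symlink length flowed straight into memcpy() and could overrun the caller's MAXPATHLEN buffer. A minimal model of the validate-before-copy pattern (buffer sizing simplified; the error value is a stand-in for EFSCORRUPTED):

#include <stdio.h>
#include <string.h>

#define MAXPATHLEN 1024

static int readlink_copy(char *link, const char *data, long long pathlen)
{
	/* range-check the untrusted length before using it as a copy size */
	if (pathlen <= 0 || pathlen > MAXPATHLEN - 1)
		return -1;		/* corrupted: reject, don't copy */

	memcpy(link, data, (size_t)pathlen);
	link[pathlen] = '\0';
	return 0;
}

int main(void)
{
	char link[MAXPATHLEN];

	printf("good:    %d\n", readlink_copy(link, "target", 6));     /*  0 */
	printf("corrupt: %d\n", readlink_copy(link, "", 1LL << 40));   /* -1 */
	return 0;
}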