diff options
author | Jim Schutt <jaschut@sandia.gov> | 2013-05-15 14:03:35 -0400 |
---|---|---|
committer | Alex Elder <elder@inktank.com> | 2013-05-17 13:45:48 -0400 |
commit | 39be95e9c8c0b5668c9f8806ffe29bf9f4bc0f40 (patch) | |
tree | 580f3e90a2175e7c9173cce615696c9972df4982 /fs | |
parent | c420276a532a10ef59849adc2681f45306166b89 (diff) |
ceph: ceph_pagelist_append might sleep while atomic
Ceph's encode_caps_cb() worked hard to avoid calling __page_cache_alloc()
while holding a lock, but that effort is defeated because
ceph_pagelist_addpage() always calls kmap(), which might sleep. Here's the result:
[13439.295457] ceph: mds0 reconnect start
[13439.300572] BUG: sleeping function called from invalid context at include/linux/highmem.h:58
[13439.309243] in_atomic(): 1, irqs_disabled(): 0, pid: 12059, name: kworker/1:1
. . .
[13439.376225] Call Trace:
[13439.378757] [<ffffffff81076f4c>] __might_sleep+0xfc/0x110
[13439.384353] [<ffffffffa03f4ce0>] ceph_pagelist_append+0x120/0x1b0 [libceph]
[13439.391491] [<ffffffffa0448fe9>] ceph_encode_locks+0x89/0x190 [ceph]
[13439.398035] [<ffffffff814ee849>] ? _raw_spin_lock+0x49/0x50
[13439.403775] [<ffffffff811cadf5>] ? lock_flocks+0x15/0x20
[13439.409277] [<ffffffffa045e2af>] encode_caps_cb+0x41f/0x4a0 [ceph]
[13439.415622] [<ffffffff81196748>] ? igrab+0x28/0x70
[13439.420610] [<ffffffffa045e9f8>] ? iterate_session_caps+0xe8/0x250 [ceph]
[13439.427584] [<ffffffffa045ea25>] iterate_session_caps+0x115/0x250 [ceph]
[13439.434499] [<ffffffffa045de90>] ? set_request_path_attr+0x2d0/0x2d0 [ceph]
[13439.441646] [<ffffffffa0462888>] send_mds_reconnect+0x238/0x450 [ceph]
[13439.448363] [<ffffffffa0464542>] ? ceph_mdsmap_decode+0x5e2/0x770 [ceph]
[13439.455250] [<ffffffffa0462e42>] check_new_map+0x352/0x500 [ceph]
[13439.461534] [<ffffffffa04631ad>] ceph_mdsc_handle_map+0x1bd/0x260 [ceph]
[13439.468432] [<ffffffff814ebc7e>] ? mutex_unlock+0xe/0x10
[13439.473934] [<ffffffffa043c612>] extra_mon_dispatch+0x22/0x30 [ceph]
[13439.480464] [<ffffffffa03f6c2c>] dispatch+0xbc/0x110 [libceph]
[13439.486492] [<ffffffffa03eec3d>] process_message+0x1ad/0x1d0 [libceph]
[13439.493190] [<ffffffffa03f1498>] ? read_partial_message+0x3e8/0x520 [libceph]
. . .
[13439.587132] ceph: mds0 reconnect success
[13490.720032] ceph: mds0 caps stale
[13501.235257] ceph: mds0 recovery completed
[13501.300419] ceph: mds0 caps renewed
Fix it up by encoding locks into a buffer first, and when the number
of encoded locks is stable, copy that buffer into a ceph_pagelist.
[elder@inktank.com: abbreviated the stack info a bit.]
Cc: stable@vger.kernel.org # 3.4+
Signed-off-by: Jim Schutt <jaschut@sandia.gov>
Reviewed-by: Alex Elder <elder@inktank.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ceph/locks.c | 76 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 65 | ||||
-rw-r--r-- | fs/ceph/super.h | 9 |
3 files changed, 89 insertions, 61 deletions
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index a80ed18d64ff..ebbf680378e2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c | |||
@@ -191,29 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) | |||
191 | } | 191 | } |
192 | 192 | ||
193 | /** | 193 | /** |
194 | * Encode the flock and fcntl locks for the given inode into the pagelist. | 194 | * Encode the flock and fcntl locks for the given inode into the ceph_filelock |
195 | * Format is: #fcntl locks, sequential fcntl locks, #flock locks, | 195 | * array. Must be called with lock_flocks() already held. |
196 | * sequential flock locks. | 196 | * If we encounter more of a specific lock type than expected, return -ENOSPC. |
197 | * Must be called with lock_flocks() already held. | ||
198 | * If we encounter more of a specific lock type than expected, | ||
199 | * we return the value 1. | ||
200 | */ | 197 | */ |
201 | int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, | 198 | int ceph_encode_locks_to_buffer(struct inode *inode, |
202 | int num_fcntl_locks, int num_flock_locks) | 199 | struct ceph_filelock *flocks, |
200 | int num_fcntl_locks, int num_flock_locks) | ||
203 | { | 201 | { |
204 | struct file_lock *lock; | 202 | struct file_lock *lock; |
205 | struct ceph_filelock cephlock; | ||
206 | int err = 0; | 203 | int err = 0; |
207 | int seen_fcntl = 0; | 204 | int seen_fcntl = 0; |
208 | int seen_flock = 0; | 205 | int seen_flock = 0; |
209 | __le32 nlocks; | 206 | int l = 0; |
210 | 207 | ||
211 | dout("encoding %d flock and %d fcntl locks", num_flock_locks, | 208 | dout("encoding %d flock and %d fcntl locks", num_flock_locks, |
212 | num_fcntl_locks); | 209 | num_fcntl_locks); |
213 | nlocks = cpu_to_le32(num_fcntl_locks); | 210 | |
214 | err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); | ||
215 | if (err) | ||
216 | goto fail; | ||
217 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { | 211 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { |
218 | if (lock->fl_flags & FL_POSIX) { | 212 | if (lock->fl_flags & FL_POSIX) { |
219 | ++seen_fcntl; | 213 | ++seen_fcntl; |
@@ -221,20 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, | |||
221 | err = -ENOSPC; | 215 | err = -ENOSPC; |
222 | goto fail; | 216 | goto fail; |
223 | } | 217 | } |
224 | err = lock_to_ceph_filelock(lock, &cephlock); | 218 | err = lock_to_ceph_filelock(lock, &flocks[l]); |
225 | if (err) | 219 | if (err) |
226 | goto fail; | 220 | goto fail; |
227 | err = ceph_pagelist_append(pagelist, &cephlock, | 221 | ++l; |
228 | sizeof(struct ceph_filelock)); | ||
229 | } | 222 | } |
230 | if (err) | ||
231 | goto fail; | ||
232 | } | 223 | } |
233 | |||
234 | nlocks = cpu_to_le32(num_flock_locks); | ||
235 | err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); | ||
236 | if (err) | ||
237 | goto fail; | ||
238 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { | 224 | for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { |
239 | if (lock->fl_flags & FL_FLOCK) { | 225 | if (lock->fl_flags & FL_FLOCK) { |
240 | ++seen_flock; | 226 | ++seen_flock; |
@@ -242,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, | |||
242 | err = -ENOSPC; | 228 | err = -ENOSPC; |
243 | goto fail; | 229 | goto fail; |
244 | } | 230 | } |
245 | err = lock_to_ceph_filelock(lock, &cephlock); | 231 | err = lock_to_ceph_filelock(lock, &flocks[l]); |
246 | if (err) | 232 | if (err) |
247 | goto fail; | 233 | goto fail; |
248 | err = ceph_pagelist_append(pagelist, &cephlock, | 234 | ++l; |
249 | sizeof(struct ceph_filelock)); | ||
250 | } | 235 | } |
251 | if (err) | ||
252 | goto fail; | ||
253 | } | 236 | } |
254 | fail: | 237 | fail: |
255 | return err; | 238 | return err; |
256 | } | 239 | } |
257 | 240 | ||
241 | /** | ||
242 | * Copy the encoded flock and fcntl locks into the pagelist. | ||
243 | * Format is: #fcntl locks, sequential fcntl locks, #flock locks, | ||
244 | * sequential flock locks. | ||
245 | * Returns zero on success. | ||
246 | */ | ||
247 | int ceph_locks_to_pagelist(struct ceph_filelock *flocks, | ||
248 | struct ceph_pagelist *pagelist, | ||
249 | int num_fcntl_locks, int num_flock_locks) | ||
250 | { | ||
251 | int err = 0; | ||
252 | __le32 nlocks; | ||
253 | |||
254 | nlocks = cpu_to_le32(num_fcntl_locks); | ||
255 | err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); | ||
256 | if (err) | ||
257 | goto out_fail; | ||
258 | |||
259 | err = ceph_pagelist_append(pagelist, flocks, | ||
260 | num_fcntl_locks * sizeof(*flocks)); | ||
261 | if (err) | ||
262 | goto out_fail; | ||
263 | |||
264 | nlocks = cpu_to_le32(num_flock_locks); | ||
265 | err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); | ||
266 | if (err) | ||
267 | goto out_fail; | ||
268 | |||
269 | err = ceph_pagelist_append(pagelist, | ||
270 | &flocks[num_fcntl_locks], | ||
271 | num_flock_locks * sizeof(*flocks)); | ||
272 | out_fail: | ||
273 | return err; | ||
274 | } | ||
275 | |||
258 | /* | 276 | /* |
259 | * Given a pointer to a lock, convert it to a ceph filelock | 277 | * Given a pointer to a lock, convert it to a ceph filelock |
260 | */ | 278 | */ |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d9ca15255477..4d2920304be8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -2478,39 +2478,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
2478 | 2478 | ||
2479 | if (recon_state->flock) { | 2479 | if (recon_state->flock) { |
2480 | int num_fcntl_locks, num_flock_locks; | 2480 | int num_fcntl_locks, num_flock_locks; |
2481 | struct ceph_pagelist_cursor trunc_point; | 2481 | struct ceph_filelock *flocks; |
2482 | 2482 | ||
2483 | ceph_pagelist_set_cursor(pagelist, &trunc_point); | 2483 | encode_again: |
2484 | do { | 2484 | lock_flocks(); |
2485 | lock_flocks(); | 2485 | ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); |
2486 | ceph_count_locks(inode, &num_fcntl_locks, | 2486 | unlock_flocks(); |
2487 | &num_flock_locks); | 2487 | flocks = kmalloc((num_fcntl_locks+num_flock_locks) * |
2488 | rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + | 2488 | sizeof(struct ceph_filelock), GFP_NOFS); |
2489 | (num_fcntl_locks+num_flock_locks) * | 2489 | if (!flocks) { |
2490 | sizeof(struct ceph_filelock)); | 2490 | err = -ENOMEM; |
2491 | unlock_flocks(); | 2491 | goto out_free; |
2492 | 2492 | } | |
2493 | /* pre-alloc pagelist */ | 2493 | lock_flocks(); |
2494 | ceph_pagelist_truncate(pagelist, &trunc_point); | 2494 | err = ceph_encode_locks_to_buffer(inode, flocks, |
2495 | err = ceph_pagelist_append(pagelist, &rec, reclen); | 2495 | num_fcntl_locks, |
2496 | if (!err) | 2496 | num_flock_locks); |
2497 | err = ceph_pagelist_reserve(pagelist, | 2497 | unlock_flocks(); |
2498 | rec.v2.flock_len); | 2498 | if (err) { |
2499 | 2499 | kfree(flocks); | |
2500 | /* encode locks */ | 2500 | if (err == -ENOSPC) |
2501 | if (!err) { | 2501 | goto encode_again; |
2502 | lock_flocks(); | 2502 | goto out_free; |
2503 | err = ceph_encode_locks(inode, | 2503 | } |
2504 | pagelist, | 2504 | /* |
2505 | num_fcntl_locks, | 2505 | * number of encoded locks is stable, so copy to pagelist |
2506 | num_flock_locks); | 2506 | */ |
2507 | unlock_flocks(); | 2507 | rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + |
2508 | } | 2508 | (num_fcntl_locks+num_flock_locks) * |
2509 | } while (err == -ENOSPC); | 2509 | sizeof(struct ceph_filelock)); |
2510 | err = ceph_pagelist_append(pagelist, &rec, reclen); | ||
2511 | if (!err) | ||
2512 | err = ceph_locks_to_pagelist(flocks, pagelist, | ||
2513 | num_fcntl_locks, | ||
2514 | num_flock_locks); | ||
2515 | kfree(flocks); | ||
2510 | } else { | 2516 | } else { |
2511 | err = ceph_pagelist_append(pagelist, &rec, reclen); | 2517 | err = ceph_pagelist_append(pagelist, &rec, reclen); |
2512 | } | 2518 | } |
2513 | |||
2514 | out_free: | 2519 | out_free: |
2515 | kfree(path); | 2520 | kfree(path); |
2516 | out_dput: | 2521 | out_dput: |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8696be2ff679..7ccfdb4aea2e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -822,8 +822,13 @@ extern const struct export_operations ceph_export_ops; | |||
822 | extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); | 822 | extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); |
823 | extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); | 823 | extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); |
824 | extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); | 824 | extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); |
825 | extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, | 825 | extern int ceph_encode_locks_to_buffer(struct inode *inode, |
826 | int p_locks, int f_locks); | 826 | struct ceph_filelock *flocks, |
827 | int num_fcntl_locks, | ||
828 | int num_flock_locks); | ||
829 | extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, | ||
830 | struct ceph_pagelist *pagelist, | ||
831 | int num_fcntl_locks, int num_flock_locks); | ||
827 | extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); | 832 | extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); |
828 | 833 | ||
829 | /* debugfs.c */ | 834 | /* debugfs.c */ |