summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/cmservice.c78
-rw-r--r--fs/afs/fsclient.c221
-rw-r--r--fs/afs/internal.h14
-rw-r--r--fs/afs/rxrpc.c73
-rw-r--r--fs/afs/vlclient.c11
-rw-r--r--fs/aio.c7
-rw-r--r--fs/autofs4/expire.c55
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c5
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/ctree.h6
-rw-r--r--fs/btrfs/delayed-ref.c34
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c56
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c223
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file.c36
-rw-r--r--fs/btrfs/inode-map.c3
-rw-r--r--fs/btrfs/inode.c83
-rw-r--r--fs/btrfs/ioctl.c14
-rw-r--r--fs/btrfs/qgroup.c62
-rw-r--r--fs/btrfs/qgroup.h36
-rw-r--r--fs/btrfs/relocation.c128
-rw-r--r--fs/btrfs/root-tree.c27
-rw-r--r--fs/btrfs/send.c181
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/transaction.c7
-rw-r--r--fs/btrfs/tree-log.c107
-rw-r--r--fs/btrfs/tree-log.h5
-rw-r--r--fs/btrfs/volumes.c27
-rw-r--r--fs/ceph/caps.c5
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/mds_client.c1
-rw-r--r--fs/cifs/cifsfs.c29
-rw-r--r--fs/cifs/cifsproto.h2
-rw-r--r--fs/cifs/connect.c31
-rw-r--r--fs/configfs/file.c1
-rw-r--r--fs/crypto/policy.c41
-rw-r--r--fs/devpts/inode.c3
-rw-r--r--fs/dlm/debug_fs.c62
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/super.c18
-rw-r--r--fs/ext4/xattr.c53
-rw-r--r--fs/ext4/xattr.h1
-rw-r--r--fs/f2fs/data.c2
-rw-r--r--fs/f2fs/f2fs.h12
-rw-r--r--fs/f2fs/file.c22
-rw-r--r--fs/f2fs/node.c47
-rw-r--r--fs/f2fs/super.c6
-rw-r--r--fs/fs-writeback.c6
-rw-r--r--fs/fuse/file.c7
-rw-r--r--fs/ioctl.c6
-rw-r--r--fs/iomap.c26
-rw-r--r--fs/kernfs/file.c28
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.h3
-rw-r--r--fs/nfs/blocklayout/extent_tree.c10
-rw-r--r--fs/nfs/callback.c1
-rw-r--r--fs/nfs/callback_proc.c8
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c45
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c23
-rw-r--r--fs/nfs/internal.h5
-rw-r--r--fs/nfs/nfs42proc.c36
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4proc.c119
-rw-r--r--fs/nfs/nfs4renewd.c20
-rw-r--r--fs/nfs/nfs4session.c53
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c9
-rw-r--r--fs/nfs/pnfs.c44
-rw-r--r--fs/nfs/super.c19
-rw-r--r--fs/nfsd/nfs4state.c65
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/notify/fanotify/fanotify.c13
-rw-r--r--fs/notify/fanotify/fanotify_user.c36
-rw-r--r--fs/notify/group.c19
-rw-r--r--fs/notify/notification.c23
-rw-r--r--fs/ocfs2/alloc.c56
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h5
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c12
-rw-r--r--fs/ocfs2/file.c34
-rw-r--r--fs/ocfs2/suballoc.c14
-rw-r--r--fs/overlayfs/copy_up.c2
-rw-r--r--fs/overlayfs/dir.c58
-rw-r--r--fs/overlayfs/inode.c108
-rw-r--r--fs/overlayfs/overlayfs.h17
-rw-r--r--fs/overlayfs/readdir.c63
-rw-r--r--fs/overlayfs/super.c93
-rw-r--r--fs/pipe.c4
-rw-r--r--fs/proc/base.c7
-rw-r--r--fs/proc/kcore.c31
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--fs/ramfs/file-mmu.c9
-rw-r--r--fs/seq_file.c4
-rw-r--r--fs/sysfs/file.c8
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/xattr.c5
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c16
-rw-r--r--fs/xfs/libxfs/xfs_btree.c14
-rw-r--r--fs/xfs/libxfs/xfs_defer.c17
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_format.h13
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_sb.c3
-rw-r--r--fs/xfs/xfs_buf.c3
-rw-r--r--fs/xfs/xfs_file.c13
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_iomap.c69
-rw-r--r--fs/xfs/xfs_iomap.h1
-rw-r--r--fs/xfs/xfs_iops.c9
-rw-r--r--fs/xfs/xfs_super.c9
-rw-r--r--fs/xfs/xfs_trace.h3
119 files changed, 2206 insertions, 1043 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 4b0eff6da674..85737e96ab8b 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -189,11 +189,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
189 case 1: 189 case 1:
190 _debug("extract FID count"); 190 _debug("extract FID count");
191 ret = afs_extract_data(call, skb, last, &call->tmp, 4); 191 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
192 switch (ret) { 192 if (ret < 0)
193 case 0: break; 193 return ret;
194 case -EAGAIN: return 0;
195 default: return ret;
196 }
197 194
198 call->count = ntohl(call->tmp); 195 call->count = ntohl(call->tmp);
199 _debug("FID count: %u", call->count); 196 _debug("FID count: %u", call->count);
@@ -210,11 +207,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
210 _debug("extract FID array"); 207 _debug("extract FID array");
211 ret = afs_extract_data(call, skb, last, call->buffer, 208 ret = afs_extract_data(call, skb, last, call->buffer,
212 call->count * 3 * 4); 209 call->count * 3 * 4);
213 switch (ret) { 210 if (ret < 0)
214 case 0: break; 211 return ret;
215 case -EAGAIN: return 0;
216 default: return ret;
217 }
218 212
219 _debug("unmarshall FID array"); 213 _debug("unmarshall FID array");
220 call->request = kcalloc(call->count, 214 call->request = kcalloc(call->count,
@@ -239,11 +233,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
239 case 3: 233 case 3:
240 _debug("extract CB count"); 234 _debug("extract CB count");
241 ret = afs_extract_data(call, skb, last, &call->tmp, 4); 235 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
242 switch (ret) { 236 if (ret < 0)
243 case 0: break; 237 return ret;
244 case -EAGAIN: return 0;
245 default: return ret;
246 }
247 238
248 tmp = ntohl(call->tmp); 239 tmp = ntohl(call->tmp);
249 _debug("CB count: %u", tmp); 240 _debug("CB count: %u", tmp);
@@ -258,11 +249,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
258 _debug("extract CB array"); 249 _debug("extract CB array");
259 ret = afs_extract_data(call, skb, last, call->request, 250 ret = afs_extract_data(call, skb, last, call->request,
260 call->count * 3 * 4); 251 call->count * 3 * 4);
261 switch (ret) { 252 if (ret < 0)
262 case 0: break; 253 return ret;
263 case -EAGAIN: return 0;
264 default: return ret;
265 }
266 254
267 _debug("unmarshall CB array"); 255 _debug("unmarshall CB array");
268 cb = call->request; 256 cb = call->request;
@@ -278,9 +266,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
278 call->unmarshall++; 266 call->unmarshall++;
279 267
280 case 5: 268 case 5:
281 _debug("trailer"); 269 ret = afs_data_complete(call, skb, last);
282 if (skb->len != 0) 270 if (ret < 0)
283 return -EBADMSG; 271 return ret;
284 272
285 /* Record that the message was unmarshalled successfully so 273 /* Record that the message was unmarshalled successfully so
286 * that the call destructor can know do the callback breaking 274 * that the call destructor can know do the callback breaking
@@ -294,8 +282,6 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
294 break; 282 break;
295 } 283 }
296 284
297 if (!last)
298 return 0;
299 285
300 call->state = AFS_CALL_REPLYING; 286 call->state = AFS_CALL_REPLYING;
301 287
@@ -335,13 +321,13 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
335{ 321{
336 struct afs_server *server; 322 struct afs_server *server;
337 struct in_addr addr; 323 struct in_addr addr;
324 int ret;
338 325
339 _enter(",{%u},%d", skb->len, last); 326 _enter(",{%u},%d", skb->len, last);
340 327
341 if (skb->len > 0) 328 ret = afs_data_complete(call, skb, last);
342 return -EBADMSG; 329 if (ret < 0)
343 if (!last) 330 return ret;
344 return 0;
345 331
346 /* no unmarshalling required */ 332 /* no unmarshalling required */
347 call->state = AFS_CALL_REPLYING; 333 call->state = AFS_CALL_REPLYING;
@@ -371,8 +357,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
371 357
372 _enter(",{%u},%d", skb->len, last); 358 _enter(",{%u},%d", skb->len, last);
373 359
360 /* There are some arguments that we ignore */
361 afs_data_consumed(call, skb);
374 if (!last) 362 if (!last)
375 return 0; 363 return -EAGAIN;
376 364
377 /* no unmarshalling required */ 365 /* no unmarshalling required */
378 call->state = AFS_CALL_REPLYING; 366 call->state = AFS_CALL_REPLYING;
@@ -408,12 +396,13 @@ static void SRXAFSCB_Probe(struct work_struct *work)
408static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, 396static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
409 bool last) 397 bool last)
410{ 398{
399 int ret;
400
411 _enter(",{%u},%d", skb->len, last); 401 _enter(",{%u},%d", skb->len, last);
412 402
413 if (skb->len > 0) 403 ret = afs_data_complete(call, skb, last);
414 return -EBADMSG; 404 if (ret < 0)
415 if (!last) 405 return ret;
416 return 0;
417 406
418 /* no unmarshalling required */ 407 /* no unmarshalling required */
419 call->state = AFS_CALL_REPLYING; 408 call->state = AFS_CALL_REPLYING;
@@ -460,10 +449,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
460 449
461 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); 450 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
462 451
463 if (skb->len > 0) 452 ret = afs_data_complete(call, skb, last);
464 return -EBADMSG; 453 if (ret < 0)
465 if (!last) 454 return ret;
466 return 0;
467 455
468 switch (call->unmarshall) { 456 switch (call->unmarshall) {
469 case 0: 457 case 0:
@@ -509,8 +497,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
509 break; 497 break;
510 } 498 }
511 499
512 if (!last) 500 ret = afs_data_complete(call, skb, last);
513 return 0; 501 if (ret < 0)
502 return ret;
514 503
515 call->state = AFS_CALL_REPLYING; 504 call->state = AFS_CALL_REPLYING;
516 505
@@ -588,12 +577,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
588static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, 577static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
589 struct sk_buff *skb, bool last) 578 struct sk_buff *skb, bool last)
590{ 579{
580 int ret;
581
591 _enter(",{%u},%d", skb->len, last); 582 _enter(",{%u},%d", skb->len, last);
592 583
593 if (skb->len > 0) 584 ret = afs_data_complete(call, skb, last);
594 return -EBADMSG; 585 if (ret < 0)
595 if (!last) 586 return ret;
596 return 0;
597 587
598 /* no unmarshalling required */ 588 /* no unmarshalling required */
599 call->state = AFS_CALL_REPLYING; 589 call->state = AFS_CALL_REPLYING;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index c2e930ec2888..9312b92e54be 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -240,15 +240,13 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call,
240{ 240{
241 struct afs_vnode *vnode = call->reply; 241 struct afs_vnode *vnode = call->reply;
242 const __be32 *bp; 242 const __be32 *bp;
243 int ret;
243 244
244 _enter(",,%u", last); 245 _enter(",,%u", last);
245 246
246 afs_transfer_reply(call, skb); 247 ret = afs_transfer_reply(call, skb, last);
247 if (!last) 248 if (ret < 0)
248 return 0; 249 return ret;
249
250 if (call->reply_size != call->reply_max)
251 return -EBADMSG;
252 250
253 /* unmarshall the reply once we've received all of it */ 251 /* unmarshall the reply once we've received all of it */
254 bp = call->buffer; 252 bp = call->buffer;
@@ -335,11 +333,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
335 case 1: 333 case 1:
336 _debug("extract data length (MSW)"); 334 _debug("extract data length (MSW)");
337 ret = afs_extract_data(call, skb, last, &call->tmp, 4); 335 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
338 switch (ret) { 336 if (ret < 0)
339 case 0: break; 337 return ret;
340 case -EAGAIN: return 0;
341 default: return ret;
342 }
343 338
344 call->count = ntohl(call->tmp); 339 call->count = ntohl(call->tmp);
345 _debug("DATA length MSW: %u", call->count); 340 _debug("DATA length MSW: %u", call->count);
@@ -353,11 +348,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
353 case 2: 348 case 2:
354 _debug("extract data length"); 349 _debug("extract data length");
355 ret = afs_extract_data(call, skb, last, &call->tmp, 4); 350 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
356 switch (ret) { 351 if (ret < 0)
357 case 0: break; 352 return ret;
358 case -EAGAIN: return 0;
359 default: return ret;
360 }
361 353
362 call->count = ntohl(call->tmp); 354 call->count = ntohl(call->tmp);
363 _debug("DATA length: %u", call->count); 355 _debug("DATA length: %u", call->count);
@@ -375,11 +367,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
375 ret = afs_extract_data(call, skb, last, buffer, 367 ret = afs_extract_data(call, skb, last, buffer,
376 call->count); 368 call->count);
377 kunmap_atomic(buffer); 369 kunmap_atomic(buffer);
378 switch (ret) { 370 if (ret < 0)
379 case 0: break; 371 return ret;
380 case -EAGAIN: return 0;
381 default: return ret;
382 }
383 } 372 }
384 373
385 call->offset = 0; 374 call->offset = 0;
@@ -389,11 +378,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
389 case 4: 378 case 4:
390 ret = afs_extract_data(call, skb, last, call->buffer, 379 ret = afs_extract_data(call, skb, last, call->buffer,
391 (21 + 3 + 6) * 4); 380 (21 + 3 + 6) * 4);
392 switch (ret) { 381 if (ret < 0)
393 case 0: break; 382 return ret;
394 case -EAGAIN: return 0;
395 default: return ret;
396 }
397 383
398 bp = call->buffer; 384 bp = call->buffer;
399 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL); 385 xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
@@ -405,15 +391,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
405 call->unmarshall++; 391 call->unmarshall++;
406 392
407 case 5: 393 case 5:
408 _debug("trailer"); 394 ret = afs_data_complete(call, skb, last);
409 if (skb->len != 0) 395 if (ret < 0)
410 return -EBADMSG; 396 return ret;
411 break; 397 break;
412 } 398 }
413 399
414 if (!last)
415 return 0;
416
417 if (call->count < PAGE_SIZE) { 400 if (call->count < PAGE_SIZE) {
418 _debug("clear"); 401 _debug("clear");
419 page = call->reply3; 402 page = call->reply3;
@@ -537,9 +520,8 @@ static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
537{ 520{
538 _enter(",{%u},%d", skb->len, last); 521 _enter(",{%u},%d", skb->len, last);
539 522
540 if (skb->len > 0) 523 /* shouldn't be any reply data */
541 return -EBADMSG; /* shouldn't be any reply data */ 524 return afs_data_complete(call, skb, last);
542 return 0;
543} 525}
544 526
545/* 527/*
@@ -622,15 +604,13 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call,
622{ 604{
623 struct afs_vnode *vnode = call->reply; 605 struct afs_vnode *vnode = call->reply;
624 const __be32 *bp; 606 const __be32 *bp;
607 int ret;
625 608
626 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); 609 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
627 610
628 afs_transfer_reply(call, skb); 611 ret = afs_transfer_reply(call, skb, last);
629 if (!last) 612 if (ret < 0)
630 return 0; 613 return ret;
631
632 if (call->reply_size != call->reply_max)
633 return -EBADMSG;
634 614
635 /* unmarshall the reply once we've received all of it */ 615 /* unmarshall the reply once we've received all of it */
636 bp = call->buffer; 616 bp = call->buffer;
@@ -721,15 +701,13 @@ static int afs_deliver_fs_remove(struct afs_call *call,
721{ 701{
722 struct afs_vnode *vnode = call->reply; 702 struct afs_vnode *vnode = call->reply;
723 const __be32 *bp; 703 const __be32 *bp;
704 int ret;
724 705
725 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); 706 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
726 707
727 afs_transfer_reply(call, skb); 708 ret = afs_transfer_reply(call, skb, last);
728 if (!last) 709 if (ret < 0)
729 return 0; 710 return ret;
730
731 if (call->reply_size != call->reply_max)
732 return -EBADMSG;
733 711
734 /* unmarshall the reply once we've received all of it */ 712 /* unmarshall the reply once we've received all of it */
735 bp = call->buffer; 713 bp = call->buffer;
@@ -804,15 +782,13 @@ static int afs_deliver_fs_link(struct afs_call *call,
804{ 782{
805 struct afs_vnode *dvnode = call->reply, *vnode = call->reply2; 783 struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
806 const __be32 *bp; 784 const __be32 *bp;
785 int ret;
807 786
808 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); 787 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
809 788
810 afs_transfer_reply(call, skb); 789 ret = afs_transfer_reply(call, skb, last);
811 if (!last) 790 if (ret < 0)
812 return 0; 791 return ret;
813
814 if (call->reply_size != call->reply_max)
815 return -EBADMSG;
816 792
817 /* unmarshall the reply once we've received all of it */ 793 /* unmarshall the reply once we've received all of it */
818 bp = call->buffer; 794 bp = call->buffer;
@@ -892,15 +868,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call,
892{ 868{
893 struct afs_vnode *vnode = call->reply; 869 struct afs_vnode *vnode = call->reply;
894 const __be32 *bp; 870 const __be32 *bp;
871 int ret;
895 872
896 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); 873 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
897 874
898 afs_transfer_reply(call, skb); 875 ret = afs_transfer_reply(call, skb, last);
899 if (!last) 876 if (ret < 0)
900 return 0; 877 return ret;
901
902 if (call->reply_size != call->reply_max)
903 return -EBADMSG;
904 878
905 /* unmarshall the reply once we've received all of it */ 879 /* unmarshall the reply once we've received all of it */
906 bp = call->buffer; 880 bp = call->buffer;
@@ -999,15 +973,13 @@ static int afs_deliver_fs_rename(struct afs_call *call,
999{ 973{
1000 struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2; 974 struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
1001 const __be32 *bp; 975 const __be32 *bp;
976 int ret;
1002 977
1003 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); 978 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
1004 979
1005 afs_transfer_reply(call, skb); 980 ret = afs_transfer_reply(call, skb, last);
1006 if (!last) 981 if (ret < 0)
1007 return 0; 982 return ret;
1008
1009 if (call->reply_size != call->reply_max)
1010 return -EBADMSG;
1011 983
1012 /* unmarshall the reply once we've received all of it */ 984 /* unmarshall the reply once we've received all of it */
1013 bp = call->buffer; 985 bp = call->buffer;
@@ -1105,20 +1077,13 @@ static int afs_deliver_fs_store_data(struct afs_call *call,
1105{ 1077{
1106 struct afs_vnode *vnode = call->reply; 1078 struct afs_vnode *vnode = call->reply;
1107 const __be32 *bp; 1079 const __be32 *bp;
1080 int ret;
1108 1081
1109 _enter(",,%u", last); 1082 _enter(",,%u", last);
1110 1083
1111 afs_transfer_reply(call, skb); 1084 ret = afs_transfer_reply(call, skb, last);
1112 if (!last) { 1085 if (ret < 0)
1113 _leave(" = 0 [more]"); 1086 return ret;
1114 return 0;
1115 }
1116
1117 if (call->reply_size != call->reply_max) {
1118 _leave(" = -EBADMSG [%u != %u]",
1119 call->reply_size, call->reply_max);
1120 return -EBADMSG;
1121 }
1122 1087
1123 /* unmarshall the reply once we've received all of it */ 1088 /* unmarshall the reply once we've received all of it */
1124 bp = call->buffer; 1089 bp = call->buffer;
@@ -1292,20 +1257,13 @@ static int afs_deliver_fs_store_status(struct afs_call *call,
1292 afs_dataversion_t *store_version; 1257 afs_dataversion_t *store_version;
1293 struct afs_vnode *vnode = call->reply; 1258 struct afs_vnode *vnode = call->reply;
1294 const __be32 *bp; 1259 const __be32 *bp;
1260 int ret;
1295 1261
1296 _enter(",,%u", last); 1262 _enter(",,%u", last);
1297 1263
1298 afs_transfer_reply(call, skb); 1264 ret = afs_transfer_reply(call, skb, last);
1299 if (!last) { 1265 if (ret < 0)
1300 _leave(" = 0 [more]"); 1266 return ret;
1301 return 0;
1302 }
1303
1304 if (call->reply_size != call->reply_max) {
1305 _leave(" = -EBADMSG [%u != %u]",
1306 call->reply_size, call->reply_max);
1307 return -EBADMSG;
1308 }
1309 1267
1310 /* unmarshall the reply once we've received all of it */ 1268 /* unmarshall the reply once we've received all of it */
1311 store_version = NULL; 1269 store_version = NULL;
@@ -1504,11 +1462,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1504 _debug("extract status"); 1462 _debug("extract status");
1505 ret = afs_extract_data(call, skb, last, call->buffer, 1463 ret = afs_extract_data(call, skb, last, call->buffer,
1506 12 * 4); 1464 12 * 4);
1507 switch (ret) { 1465 if (ret < 0)
1508 case 0: break; 1466 return ret;
1509 case -EAGAIN: return 0;
1510 default: return ret;
1511 }
1512 1467
1513 bp = call->buffer; 1468 bp = call->buffer;
1514 xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2); 1469 xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
@@ -1518,11 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1518 /* extract the volume name length */ 1473 /* extract the volume name length */
1519 case 2: 1474 case 2:
1520 ret = afs_extract_data(call, skb, last, &call->tmp, 4); 1475 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
1521 switch (ret) { 1476 if (ret < 0)
1522 case 0: break; 1477 return ret;
1523 case -EAGAIN: return 0;
1524 default: return ret;
1525 }
1526 1478
1527 call->count = ntohl(call->tmp); 1479 call->count = ntohl(call->tmp);
1528 _debug("volname length: %u", call->count); 1480 _debug("volname length: %u", call->count);
@@ -1537,11 +1489,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1537 if (call->count > 0) { 1489 if (call->count > 0) {
1538 ret = afs_extract_data(call, skb, last, call->reply3, 1490 ret = afs_extract_data(call, skb, last, call->reply3,
1539 call->count); 1491 call->count);
1540 switch (ret) { 1492 if (ret < 0)
1541 case 0: break; 1493 return ret;
1542 case -EAGAIN: return 0;
1543 default: return ret;
1544 }
1545 } 1494 }
1546 1495
1547 p = call->reply3; 1496 p = call->reply3;
@@ -1561,11 +1510,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1561 case 4: 1510 case 4:
1562 ret = afs_extract_data(call, skb, last, call->buffer, 1511 ret = afs_extract_data(call, skb, last, call->buffer,
1563 call->count); 1512 call->count);
1564 switch (ret) { 1513 if (ret < 0)
1565 case 0: break; 1514 return ret;
1566 case -EAGAIN: return 0;
1567 default: return ret;
1568 }
1569 1515
1570 call->offset = 0; 1516 call->offset = 0;
1571 call->unmarshall++; 1517 call->unmarshall++;
@@ -1574,11 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1574 /* extract the offline message length */ 1520 /* extract the offline message length */
1575 case 5: 1521 case 5:
1576 ret = afs_extract_data(call, skb, last, &call->tmp, 4); 1522 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
1577 switch (ret) { 1523 if (ret < 0)
1578 case 0: break; 1524 return ret;
1579 case -EAGAIN: return 0;
1580 default: return ret;
1581 }
1582 1525
1583 call->count = ntohl(call->tmp); 1526 call->count = ntohl(call->tmp);
1584 _debug("offline msg length: %u", call->count); 1527 _debug("offline msg length: %u", call->count);
@@ -1593,11 +1536,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1593 if (call->count > 0) { 1536 if (call->count > 0) {
1594 ret = afs_extract_data(call, skb, last, call->reply3, 1537 ret = afs_extract_data(call, skb, last, call->reply3,
1595 call->count); 1538 call->count);
1596 switch (ret) { 1539 if (ret < 0)
1597 case 0: break; 1540 return ret;
1598 case -EAGAIN: return 0;
1599 default: return ret;
1600 }
1601 } 1541 }
1602 1542
1603 p = call->reply3; 1543 p = call->reply3;
@@ -1617,11 +1557,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1617 case 7: 1557 case 7:
1618 ret = afs_extract_data(call, skb, last, call->buffer, 1558 ret = afs_extract_data(call, skb, last, call->buffer,
1619 call->count); 1559 call->count);
1620 switch (ret) { 1560 if (ret < 0)
1621 case 0: break; 1561 return ret;
1622 case -EAGAIN: return 0;
1623 default: return ret;
1624 }
1625 1562
1626 call->offset = 0; 1563 call->offset = 0;
1627 call->unmarshall++; 1564 call->unmarshall++;
@@ -1630,11 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1630 /* extract the message of the day length */ 1567 /* extract the message of the day length */
1631 case 8: 1568 case 8:
1632 ret = afs_extract_data(call, skb, last, &call->tmp, 4); 1569 ret = afs_extract_data(call, skb, last, &call->tmp, 4);
1633 switch (ret) { 1570 if (ret < 0)
1634 case 0: break; 1571 return ret;
1635 case -EAGAIN: return 0;
1636 default: return ret;
1637 }
1638 1572
1639 call->count = ntohl(call->tmp); 1573 call->count = ntohl(call->tmp);
1640 _debug("motd length: %u", call->count); 1574 _debug("motd length: %u", call->count);
@@ -1649,11 +1583,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1649 if (call->count > 0) { 1583 if (call->count > 0) {
1650 ret = afs_extract_data(call, skb, last, call->reply3, 1584 ret = afs_extract_data(call, skb, last, call->reply3,
1651 call->count); 1585 call->count);
1652 switch (ret) { 1586 if (ret < 0)
1653 case 0: break; 1587 return ret;
1654 case -EAGAIN: return 0;
1655 default: return ret;
1656 }
1657 } 1588 }
1658 1589
1659 p = call->reply3; 1590 p = call->reply3;
@@ -1673,26 +1604,20 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
1673 case 10: 1604 case 10:
1674 ret = afs_extract_data(call, skb, last, call->buffer, 1605 ret = afs_extract_data(call, skb, last, call->buffer,
1675 call->count); 1606 call->count);
1676 switch (ret) { 1607 if (ret < 0)
1677 case 0: break; 1608 return ret;
1678 case -EAGAIN: return 0;
1679 default: return ret;
1680 }
1681 1609
1682 call->offset = 0; 1610 call->offset = 0;
1683 call->unmarshall++; 1611 call->unmarshall++;
1684 no_motd_padding: 1612 no_motd_padding:
1685 1613
1686 case 11: 1614 case 11:
1687 _debug("trailer %d", skb->len); 1615 ret = afs_data_complete(call, skb, last);
1688 if (skb->len != 0) 1616 if (ret < 0)
1689 return -EBADMSG; 1617 return ret;
1690 break; 1618 break;
1691 } 1619 }
1692 1620
1693 if (!last)
1694 return 0;
1695
1696 _leave(" = 0 [done]"); 1621 _leave(" = 0 [done]");
1697 return 0; 1622 return 0;
1698} 1623}
@@ -1764,15 +1689,13 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
1764 struct sk_buff *skb, bool last) 1689 struct sk_buff *skb, bool last)
1765{ 1690{
1766 const __be32 *bp; 1691 const __be32 *bp;
1692 int ret;
1767 1693
1768 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); 1694 _enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
1769 1695
1770 afs_transfer_reply(call, skb); 1696 ret = afs_transfer_reply(call, skb, last);
1771 if (!last) 1697 if (ret < 0)
1772 return 0; 1698 return ret;
1773
1774 if (call->reply_size != call->reply_max)
1775 return -EBADMSG;
1776 1699
1777 /* unmarshall the reply once we've received all of it */ 1700 /* unmarshall the reply once we've received all of it */
1778 bp = call->buffer; 1701 bp = call->buffer;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 71d5982312f3..df976b2a7f40 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -609,17 +609,29 @@ extern void afs_proc_cell_remove(struct afs_cell *);
609 */ 609 */
610extern int afs_open_socket(void); 610extern int afs_open_socket(void);
611extern void afs_close_socket(void); 611extern void afs_close_socket(void);
612extern void afs_data_consumed(struct afs_call *, struct sk_buff *);
612extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, 613extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
613 const struct afs_wait_mode *); 614 const struct afs_wait_mode *);
614extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *, 615extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
615 size_t, size_t); 616 size_t, size_t);
616extern void afs_flat_call_destructor(struct afs_call *); 617extern void afs_flat_call_destructor(struct afs_call *);
617extern void afs_transfer_reply(struct afs_call *, struct sk_buff *); 618extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool);
618extern void afs_send_empty_reply(struct afs_call *); 619extern void afs_send_empty_reply(struct afs_call *);
619extern void afs_send_simple_reply(struct afs_call *, const void *, size_t); 620extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
620extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *, 621extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
621 size_t); 622 size_t);
622 623
624static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb,
625 bool last)
626{
627 if (skb->len > 0)
628 return -EBADMSG;
629 afs_data_consumed(call, skb);
630 if (!last)
631 return -EAGAIN;
632 return 0;
633}
634
623/* 635/*
624 * security.c 636 * security.c
625 */ 637 */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 4832de84d52c..14d04c848465 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -150,10 +150,9 @@ void afs_close_socket(void)
150} 150}
151 151
152/* 152/*
153 * note that the data in a socket buffer is now delivered and that the buffer 153 * Note that the data in a socket buffer is now consumed.
154 * should be freed
155 */ 154 */
156static void afs_data_delivered(struct sk_buff *skb) 155void afs_data_consumed(struct afs_call *call, struct sk_buff *skb)
157{ 156{
158 if (!skb) { 157 if (!skb) {
159 _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs)); 158 _debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
@@ -161,9 +160,7 @@ static void afs_data_delivered(struct sk_buff *skb)
161 } else { 160 } else {
162 _debug("DLVR %p{%u} [%d]", 161 _debug("DLVR %p{%u} [%d]",
163 skb, skb->mark, atomic_read(&afs_outstanding_skbs)); 162 skb, skb->mark, atomic_read(&afs_outstanding_skbs));
164 if (atomic_dec_return(&afs_outstanding_skbs) == -1) 163 rxrpc_kernel_data_consumed(call->rxcall, skb);
165 BUG();
166 rxrpc_kernel_data_delivered(skb);
167 } 164 }
168} 165}
169 166
@@ -489,9 +486,15 @@ static void afs_deliver_to_call(struct afs_call *call)
489 last = rxrpc_kernel_is_data_last(skb); 486 last = rxrpc_kernel_is_data_last(skb);
490 ret = call->type->deliver(call, skb, last); 487 ret = call->type->deliver(call, skb, last);
491 switch (ret) { 488 switch (ret) {
489 case -EAGAIN:
490 if (last) {
491 _debug("short data");
492 goto unmarshal_error;
493 }
494 break;
492 case 0: 495 case 0:
493 if (last && 496 ASSERT(last);
494 call->state == AFS_CALL_AWAIT_REPLY) 497 if (call->state == AFS_CALL_AWAIT_REPLY)
495 call->state = AFS_CALL_COMPLETE; 498 call->state = AFS_CALL_COMPLETE;
496 break; 499 break;
497 case -ENOTCONN: 500 case -ENOTCONN:
@@ -501,6 +504,7 @@ static void afs_deliver_to_call(struct afs_call *call)
501 abort_code = RX_INVALID_OPERATION; 504 abort_code = RX_INVALID_OPERATION;
502 goto do_abort; 505 goto do_abort;
503 default: 506 default:
507 unmarshal_error:
504 abort_code = RXGEN_CC_UNMARSHAL; 508 abort_code = RXGEN_CC_UNMARSHAL;
505 if (call->state != AFS_CALL_AWAIT_REPLY) 509 if (call->state != AFS_CALL_AWAIT_REPLY)
506 abort_code = RXGEN_SS_UNMARSHAL; 510 abort_code = RXGEN_SS_UNMARSHAL;
@@ -511,9 +515,7 @@ static void afs_deliver_to_call(struct afs_call *call)
511 call->state = AFS_CALL_ERROR; 515 call->state = AFS_CALL_ERROR;
512 break; 516 break;
513 } 517 }
514 afs_data_delivered(skb); 518 break;
515 skb = NULL;
516 continue;
517 case RXRPC_SKB_MARK_FINAL_ACK: 519 case RXRPC_SKB_MARK_FINAL_ACK:
518 _debug("Rcv ACK"); 520 _debug("Rcv ACK");
519 call->state = AFS_CALL_COMPLETE; 521 call->state = AFS_CALL_COMPLETE;
@@ -685,15 +687,35 @@ static void afs_process_async_call(struct afs_call *call)
685} 687}
686 688
687/* 689/*
688 * empty a socket buffer into a flat reply buffer 690 * Empty a socket buffer into a flat reply buffer.
689 */ 691 */
690void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb) 692int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last)
691{ 693{
692 size_t len = skb->len; 694 size_t len = skb->len;
693 695
694 if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0) 696 if (len > call->reply_max - call->reply_size) {
695 BUG(); 697 _leave(" = -EBADMSG [%zu > %u]",
696 call->reply_size += len; 698 len, call->reply_max - call->reply_size);
699 return -EBADMSG;
700 }
701
702 if (len > 0) {
703 if (skb_copy_bits(skb, 0, call->buffer + call->reply_size,
704 len) < 0)
705 BUG();
706 call->reply_size += len;
707 }
708
709 afs_data_consumed(call, skb);
710 if (!last)
711 return -EAGAIN;
712
713 if (call->reply_size != call->reply_max) {
714 _leave(" = -EBADMSG [%u != %u]",
715 call->reply_size, call->reply_max);
716 return -EBADMSG;
717 }
718 return 0;
697} 719}
698 720
699/* 721/*
@@ -745,7 +767,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
745} 767}
746 768
747/* 769/*
748 * grab the operation ID from an incoming cache manager call 770 * Grab the operation ID from an incoming cache manager call. The socket
771 * buffer is discarded on error or if we don't yet have sufficient data.
749 */ 772 */
750static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, 773static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
751 bool last) 774 bool last)
@@ -766,12 +789,9 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
766 call->offset += len; 789 call->offset += len;
767 790
768 if (call->offset < 4) { 791 if (call->offset < 4) {
769 if (last) { 792 afs_data_consumed(call, skb);
770 _leave(" = -EBADMSG [op ID short]"); 793 _leave(" = -EAGAIN");
771 return -EBADMSG; 794 return -EAGAIN;
772 }
773 _leave(" = 0 [incomplete]");
774 return 0;
775 } 795 }
776 796
777 call->state = AFS_CALL_AWAIT_REQUEST; 797 call->state = AFS_CALL_AWAIT_REQUEST;
@@ -855,7 +875,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
855} 875}
856 876
857/* 877/*
858 * extract a piece of data from the received data socket buffers 878 * Extract a piece of data from the received data socket buffers.
859 */ 879 */
860int afs_extract_data(struct afs_call *call, struct sk_buff *skb, 880int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
861 bool last, void *buf, size_t count) 881 bool last, void *buf, size_t count)
@@ -873,10 +893,7 @@ int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
873 call->offset += len; 893 call->offset += len;
874 894
875 if (call->offset < count) { 895 if (call->offset < count) {
876 if (last) { 896 afs_data_consumed(call, skb);
877 _leave(" = -EBADMSG [%d < %zu]", call->offset, count);
878 return -EBADMSG;
879 }
880 _leave(" = -EAGAIN"); 897 _leave(" = -EAGAIN");
881 return -EAGAIN; 898 return -EAGAIN;
882 } 899 }
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 340afd0cd182..f94d1abdc3eb 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -64,16 +64,13 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
64 struct afs_cache_vlocation *entry; 64 struct afs_cache_vlocation *entry;
65 __be32 *bp; 65 __be32 *bp;
66 u32 tmp; 66 u32 tmp;
67 int loop; 67 int loop, ret;
68 68
69 _enter(",,%u", last); 69 _enter(",,%u", last);
70 70
71 afs_transfer_reply(call, skb); 71 ret = afs_transfer_reply(call, skb, last);
72 if (!last) 72 if (ret < 0)
73 return 0; 73 return ret;
74
75 if (call->reply_size != call->reply_max)
76 return -EBADMSG;
77 74
78 /* unmarshall the reply once we've received all of it */ 75 /* unmarshall the reply once we've received all of it */
79 entry = call->reply; 76 entry = call->reply;
diff --git a/fs/aio.c b/fs/aio.c
index fb8e45b88cd4..4fe81d1c60f9 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
239 static const struct dentry_operations ops = { 239 static const struct dentry_operations ops = {
240 .d_dname = simple_dname, 240 .d_dname = simple_dname,
241 }; 241 };
242 return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); 242 struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops,
243 AIO_RING_MAGIC);
244
245 if (!IS_ERR(root))
246 root->d_sb->s_iflags |= SB_I_NOEXEC;
247 return root;
243} 248}
244 249
245/* aio_setup 250/* aio_setup
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b493909e7492..d8e6d421c27f 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry,
417 } 417 }
418 return NULL; 418 return NULL;
419} 419}
420
420/* 421/*
421 * Find an eligible tree to time-out 422 * Find an eligible tree to time-out
422 * A tree is eligible if :- 423 * A tree is eligible if :-
@@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
432 struct dentry *root = sb->s_root; 433 struct dentry *root = sb->s_root;
433 struct dentry *dentry; 434 struct dentry *dentry;
434 struct dentry *expired; 435 struct dentry *expired;
436 struct dentry *found;
435 struct autofs_info *ino; 437 struct autofs_info *ino;
436 438
437 if (!root) 439 if (!root)
@@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
442 444
443 dentry = NULL; 445 dentry = NULL;
444 while ((dentry = get_next_positive_subdir(dentry, root))) { 446 while ((dentry = get_next_positive_subdir(dentry, root))) {
447 int flags = how;
448
445 spin_lock(&sbi->fs_lock); 449 spin_lock(&sbi->fs_lock);
446 ino = autofs4_dentry_ino(dentry); 450 ino = autofs4_dentry_ino(dentry);
447 if (ino->flags & AUTOFS_INF_WANT_EXPIRE) 451 if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
448 expired = NULL;
449 else
450 expired = should_expire(dentry, mnt, timeout, how);
451 if (!expired) {
452 spin_unlock(&sbi->fs_lock); 452 spin_unlock(&sbi->fs_lock);
453 continue; 453 continue;
454 } 454 }
455 spin_unlock(&sbi->fs_lock);
456
457 expired = should_expire(dentry, mnt, timeout, flags);
458 if (!expired)
459 continue;
460
461 spin_lock(&sbi->fs_lock);
455 ino = autofs4_dentry_ino(expired); 462 ino = autofs4_dentry_ino(expired);
456 ino->flags |= AUTOFS_INF_WANT_EXPIRE; 463 ino->flags |= AUTOFS_INF_WANT_EXPIRE;
457 spin_unlock(&sbi->fs_lock); 464 spin_unlock(&sbi->fs_lock);
458 synchronize_rcu(); 465 synchronize_rcu();
459 spin_lock(&sbi->fs_lock);
460 if (should_expire(expired, mnt, timeout, how)) {
461 if (expired != dentry)
462 dput(dentry);
463 goto found;
464 }
465 466
467 /* Make sure a reference is not taken on found if
468 * things have changed.
469 */
470 flags &= ~AUTOFS_EXP_LEAVES;
471 found = should_expire(expired, mnt, timeout, how);
472 if (!found || found != expired)
473 /* Something has changed, continue */
474 goto next;
475
476 if (expired != dentry)
477 dput(dentry);
478
479 spin_lock(&sbi->fs_lock);
480 goto found;
481next:
482 spin_lock(&sbi->fs_lock);
466 ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; 483 ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
484 spin_unlock(&sbi->fs_lock);
467 if (expired != dentry) 485 if (expired != dentry)
468 dput(expired); 486 dput(expired);
469 spin_unlock(&sbi->fs_lock);
470 } 487 }
471 return NULL; 488 return NULL;
472 489
@@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
483 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 500 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
484 struct autofs_info *ino = autofs4_dentry_ino(dentry); 501 struct autofs_info *ino = autofs4_dentry_ino(dentry);
485 int status; 502 int status;
503 int state;
486 504
487 /* Block on any pending expire */ 505 /* Block on any pending expire */
488 if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) 506 if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
@@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
490 if (rcu_walk) 508 if (rcu_walk)
491 return -ECHILD; 509 return -ECHILD;
492 510
511retry:
493 spin_lock(&sbi->fs_lock); 512 spin_lock(&sbi->fs_lock);
494 if (ino->flags & AUTOFS_INF_EXPIRING) { 513 state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING);
514 if (state == AUTOFS_INF_WANT_EXPIRE) {
515 spin_unlock(&sbi->fs_lock);
516 /*
517 * Possibly being selected for expire, wait until
518 * it's selected or not.
519 */
520 schedule_timeout_uninterruptible(HZ/10);
521 goto retry;
522 }
523 if (state & AUTOFS_INF_EXPIRING) {
495 spin_unlock(&sbi->fs_lock); 524 spin_unlock(&sbi->fs_lock);
496 525
497 pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); 526 pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7f6aff3f72eb..e5495f37c6ed 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -853,6 +853,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
853 current->flags |= PF_RANDOMIZE; 853 current->flags |= PF_RANDOMIZE;
854 854
855 setup_new_exec(bprm); 855 setup_new_exec(bprm);
856 install_exec_creds(bprm);
856 857
857 /* Do this so that we can load the interpreter, if need be. We will 858 /* Do this so that we can load the interpreter, if need be. We will
858 change some of these later */ 859 change some of these later */
@@ -1044,7 +1045,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
1044 goto out; 1045 goto out;
1045#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ 1046#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
1046 1047
1047 install_exec_creds(bprm);
1048 retval = create_elf_tables(bprm, &loc->elf_ex, 1048 retval = create_elf_tables(bprm, &loc->elf_ex,
1049 load_addr, interp_load_addr); 1049 load_addr, interp_load_addr);
1050 if (retval < 0) 1050 if (retval < 0)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c3cdde87cc8c..08ae99343d92 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -249,7 +249,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
249 * thaw_bdev drops it. 249 * thaw_bdev drops it.
250 */ 250 */
251 sb = get_super(bdev); 251 sb = get_super(bdev);
252 drop_super(sb); 252 if (sb)
253 drop_super(sb);
253 mutex_unlock(&bdev->bd_fsfreeze_mutex); 254 mutex_unlock(&bdev->bd_fsfreeze_mutex);
254 return sb; 255 return sb;
255 } 256 }
@@ -646,7 +647,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type,
646{ 647{
647 struct dentry *dent; 648 struct dentry *dent;
648 dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); 649 dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
649 if (dent) 650 if (!IS_ERR(dent))
650 dent->d_sb->s_iflags |= SB_I_CGROUPWB; 651 dent->d_sb->s_iflags |= SB_I_CGROUPWB;
651 return dent; 652 return dent;
652} 653}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2b88439c2ee8..455a6b2fd539 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode)
589 589
590 list_del(&ref2->list); 590 list_del(&ref2->list);
591 kmem_cache_free(btrfs_prelim_ref_cache, ref2); 591 kmem_cache_free(btrfs_prelim_ref_cache, ref2);
592 cond_resched();
592 } 593 }
593 594
594 } 595 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2fe8f89091a3..33fe03551105 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -427,6 +427,7 @@ struct btrfs_space_info {
427 struct list_head ro_bgs; 427 struct list_head ro_bgs;
428 struct list_head priority_tickets; 428 struct list_head priority_tickets;
429 struct list_head tickets; 429 struct list_head tickets;
430 u64 tickets_id;
430 431
431 struct rw_semaphore groups_sem; 432 struct rw_semaphore groups_sem;
432 /* for block groups in our same type */ 433 /* for block groups in our same type */
@@ -1028,6 +1029,7 @@ struct btrfs_fs_info {
1028 struct btrfs_workqueue *qgroup_rescan_workers; 1029 struct btrfs_workqueue *qgroup_rescan_workers;
1029 struct completion qgroup_rescan_completion; 1030 struct completion qgroup_rescan_completion;
1030 struct btrfs_work qgroup_rescan_work; 1031 struct btrfs_work qgroup_rescan_work;
1032 bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
1031 1033
1032 /* filesystem state */ 1034 /* filesystem state */
1033 unsigned long fs_state; 1035 unsigned long fs_state;
@@ -1079,6 +1081,8 @@ struct btrfs_fs_info {
1079 struct list_head pinned_chunks; 1081 struct list_head pinned_chunks;
1080 1082
1081 int creating_free_space_tree; 1083 int creating_free_space_tree;
1084 /* Used to record internally whether fs has been frozen */
1085 int fs_frozen;
1082}; 1086};
1083 1087
1084struct btrfs_subvolume_writers { 1088struct btrfs_subvolume_writers {
@@ -2578,7 +2582,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
2578 struct btrfs_root *root, 2582 struct btrfs_root *root,
2579 u64 root_objectid, u64 owner, u64 offset, 2583 u64 root_objectid, u64 owner, u64 offset,
2580 struct btrfs_key *ins); 2584 struct btrfs_key *ins);
2581int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, 2585int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
2582 u64 min_alloc_size, u64 empty_size, u64 hint_byte, 2586 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
2583 struct btrfs_key *ins, int is_data, int delalloc); 2587 struct btrfs_key *ins, int is_data, int delalloc);
2584int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2588int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index b6d210e7a993..ac02e041464b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
541 struct btrfs_delayed_ref_head *existing; 541 struct btrfs_delayed_ref_head *existing;
542 struct btrfs_delayed_ref_head *head_ref = NULL; 542 struct btrfs_delayed_ref_head *head_ref = NULL;
543 struct btrfs_delayed_ref_root *delayed_refs; 543 struct btrfs_delayed_ref_root *delayed_refs;
544 struct btrfs_qgroup_extent_record *qexisting;
545 int count_mod = 1; 544 int count_mod = 1;
546 int must_insert_reserved = 0; 545 int must_insert_reserved = 0;
547 546
@@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
606 qrecord->num_bytes = num_bytes; 605 qrecord->num_bytes = num_bytes;
607 qrecord->old_roots = NULL; 606 qrecord->old_roots = NULL;
608 607
609 qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, 608 if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info,
610 delayed_refs, 609 delayed_refs, qrecord))
611 qrecord);
612 if (qexisting)
613 kfree(qrecord); 610 kfree(qrecord);
614 } 611 }
615 612
@@ -862,33 +859,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
862 return 0; 859 return 0;
863} 860}
864 861
865int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
866 struct btrfs_trans_handle *trans,
867 u64 ref_root, u64 bytenr, u64 num_bytes)
868{
869 struct btrfs_delayed_ref_root *delayed_refs;
870 struct btrfs_delayed_ref_head *ref_head;
871 int ret = 0;
872
873 if (!fs_info->quota_enabled || !is_fstree(ref_root))
874 return 0;
875
876 delayed_refs = &trans->transaction->delayed_refs;
877
878 spin_lock(&delayed_refs->lock);
879 ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
880 if (!ref_head) {
881 ret = -ENOENT;
882 goto out;
883 }
884 WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
885 ref_head->qgroup_ref_root = ref_root;
886 ref_head->qgroup_reserved = num_bytes;
887out:
888 spin_unlock(&delayed_refs->lock);
889 return ret;
890}
891
892int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, 862int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
893 struct btrfs_trans_handle *trans, 863 struct btrfs_trans_handle *trans,
894 u64 bytenr, u64 num_bytes, 864 u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5fca9534a271..43f3629760e9 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
250 u64 parent, u64 ref_root, 250 u64 parent, u64 ref_root,
251 u64 owner, u64 offset, u64 reserved, int action, 251 u64 owner, u64 offset, u64 reserved, int action,
252 struct btrfs_delayed_extent_op *extent_op); 252 struct btrfs_delayed_extent_op *extent_op);
253int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
254 struct btrfs_trans_handle *trans,
255 u64 ref_root, u64 bytenr, u64 num_bytes);
256int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, 253int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
257 struct btrfs_trans_handle *trans, 254 struct btrfs_trans_handle *trans,
258 u64 bytenr, u64 num_bytes, 255 u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59febfb8d04a..54bc8c7c6bcd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root,
559 u32 nritems = btrfs_header_nritems(leaf); 559 u32 nritems = btrfs_header_nritems(leaf);
560 int slot; 560 int slot;
561 561
562 if (nritems == 0) 562 if (nritems == 0) {
563 struct btrfs_root *check_root;
564
565 key.objectid = btrfs_header_owner(leaf);
566 key.type = BTRFS_ROOT_ITEM_KEY;
567 key.offset = (u64)-1;
568
569 check_root = btrfs_get_fs_root(root->fs_info, &key, false);
570 /*
571 * The only reason we also check NULL here is that during
572 * open_ctree() some roots has not yet been set up.
573 */
574 if (!IS_ERR_OR_NULL(check_root)) {
575 /* if leaf is the root, then it's fine */
576 if (leaf->start !=
577 btrfs_root_bytenr(&check_root->root_item)) {
578 CORRUPT("non-root leaf's nritems is 0",
579 leaf, root, 0);
580 return -EIO;
581 }
582 }
563 return 0; 583 return 0;
584 }
564 585
565 /* Check the 0 item */ 586 /* Check the 0 item */
566 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != 587 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
@@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root,
612 return 0; 633 return 0;
613} 634}
614 635
636static int check_node(struct btrfs_root *root, struct extent_buffer *node)
637{
638 unsigned long nr = btrfs_header_nritems(node);
639
640 if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
641 btrfs_crit(root->fs_info,
642 "corrupt node: block %llu root %llu nritems %lu",
643 node->start, root->objectid, nr);
644 return -EIO;
645 }
646 return 0;
647}
648
615static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, 649static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
616 u64 phy_offset, struct page *page, 650 u64 phy_offset, struct page *page,
617 u64 start, u64 end, int mirror) 651 u64 start, u64 end, int mirror)
@@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
682 ret = -EIO; 716 ret = -EIO;
683 } 717 }
684 718
719 if (found_level > 0 && check_node(root, eb))
720 ret = -EIO;
721
685 if (!ret) 722 if (!ret)
686 set_extent_buffer_uptodate(eb); 723 set_extent_buffer_uptodate(eb);
687err: 724err:
@@ -1618,8 +1655,8 @@ fail:
1618 return ret; 1655 return ret;
1619} 1656}
1620 1657
1621static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, 1658struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1622 u64 root_id) 1659 u64 root_id)
1623{ 1660{
1624 struct btrfs_root *root; 1661 struct btrfs_root *root;
1625 1662
@@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2298 fs_info->quota_enabled = 0; 2335 fs_info->quota_enabled = 0;
2299 fs_info->pending_quota_state = 0; 2336 fs_info->pending_quota_state = 0;
2300 fs_info->qgroup_ulist = NULL; 2337 fs_info->qgroup_ulist = NULL;
2338 fs_info->qgroup_rescan_running = false;
2301 mutex_init(&fs_info->qgroup_rescan_lock); 2339 mutex_init(&fs_info->qgroup_rescan_lock);
2302} 2340}
2303 2341
@@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb,
2624 atomic_set(&fs_info->qgroup_op_seq, 0); 2662 atomic_set(&fs_info->qgroup_op_seq, 0);
2625 atomic_set(&fs_info->reada_works_cnt, 0); 2663 atomic_set(&fs_info->reada_works_cnt, 0);
2626 atomic64_set(&fs_info->tree_mod_seq, 0); 2664 atomic64_set(&fs_info->tree_mod_seq, 0);
2665 fs_info->fs_frozen = 0;
2627 fs_info->sb = sb; 2666 fs_info->sb = sb;
2628 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; 2667 fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
2629 fs_info->metadata_ratio = 0; 2668 fs_info->metadata_ratio = 0;
@@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
3739 if (btrfs_root_refs(&root->root_item) == 0) 3778 if (btrfs_root_refs(&root->root_item) == 0)
3740 synchronize_srcu(&fs_info->subvol_srcu); 3779 synchronize_srcu(&fs_info->subvol_srcu);
3741 3780
3742 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 3781 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
3743 btrfs_free_log(NULL, root); 3782 btrfs_free_log(NULL, root);
3783 if (root->reloc_root) {
3784 free_extent_buffer(root->reloc_root->node);
3785 free_extent_buffer(root->reloc_root->commit_root);
3786 btrfs_put_fs_root(root->reloc_root);
3787 root->reloc_root = NULL;
3788 }
3789 }
3744 3790
3745 if (root->free_ino_pinned) 3791 if (root->free_ino_pinned)
3746 __btrfs_remove_free_space_cache(root->free_ino_pinned); 3792 __btrfs_remove_free_space_cache(root->free_ino_pinned);
@@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root)
3851 smp_mb(); 3897 smp_mb();
3852 3898
3853 /* wait for the qgroup rescan worker to stop */ 3899 /* wait for the qgroup rescan worker to stop */
3854 btrfs_qgroup_wait_for_completion(fs_info); 3900 btrfs_qgroup_wait_for_completion(fs_info, false);
3855 3901
3856 /* wait for the uuid_scan task to finish */ 3902 /* wait for the uuid_scan task to finish */
3857 down(&fs_info->uuid_tree_rescan_sem); 3903 down(&fs_info->uuid_tree_rescan_sem);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b3207a0e09f7..f19a982f5a4f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
68struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, 68struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
69 struct btrfs_key *location); 69 struct btrfs_key *location);
70int btrfs_init_fs_root(struct btrfs_root *root); 70int btrfs_init_fs_root(struct btrfs_root *root);
71struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
72 u64 root_id);
71int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, 73int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
72 struct btrfs_root *root); 74 struct btrfs_root *root);
73void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); 75void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61b494e8e604..665da8f66ff1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,21 +60,6 @@ enum {
60 CHUNK_ALLOC_FORCE = 2, 60 CHUNK_ALLOC_FORCE = 2,
61}; 61};
62 62
63/*
64 * Control how reservations are dealt with.
65 *
66 * RESERVE_FREE - freeing a reservation.
67 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
68 * ENOSPC accounting
69 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
70 * bytes_may_use as the ENOSPC accounting is done elsewhere
71 */
72enum {
73 RESERVE_FREE = 0,
74 RESERVE_ALLOC = 1,
75 RESERVE_ALLOC_NO_ACCOUNT = 2,
76};
77
78static int update_block_group(struct btrfs_trans_handle *trans, 63static int update_block_group(struct btrfs_trans_handle *trans,
79 struct btrfs_root *root, u64 bytenr, 64 struct btrfs_root *root, u64 bytenr,
80 u64 num_bytes, int alloc); 65 u64 num_bytes, int alloc);
@@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level,
104 struct btrfs_key *key); 89 struct btrfs_key *key);
105static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 90static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
106 int dump_block_groups); 91 int dump_block_groups);
107static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 92static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
108 u64 num_bytes, int reserve, 93 u64 ram_bytes, u64 num_bytes, int delalloc);
109 int delalloc); 94static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
95 u64 num_bytes, int delalloc);
110static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, 96static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
111 u64 num_bytes); 97 u64 num_bytes);
112int btrfs_pin_extent(struct btrfs_root *root, 98int btrfs_pin_extent(struct btrfs_root *root,
@@ -3501,7 +3487,6 @@ again:
3501 dcs = BTRFS_DC_SETUP; 3487 dcs = BTRFS_DC_SETUP;
3502 else if (ret == -ENOSPC) 3488 else if (ret == -ENOSPC)
3503 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); 3489 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3504 btrfs_free_reserved_data_space(inode, 0, num_pages);
3505 3490
3506out_put: 3491out_put:
3507 iput(inode); 3492 iput(inode);
@@ -4286,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4286 if (ret < 0) 4271 if (ret < 0)
4287 return ret; 4272 return ret;
4288 4273
4289 /* 4274 /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4290 * Use new btrfs_qgroup_reserve_data to reserve precious data space
4291 *
4292 * TODO: Find a good method to avoid reserve data space for NOCOW
4293 * range, but don't impact performance on quota disable case.
4294 */
4295 ret = btrfs_qgroup_reserve_data(inode, start, len); 4275 ret = btrfs_qgroup_reserve_data(inode, start, len);
4276 if (ret)
4277 btrfs_free_reserved_data_space_noquota(inode, start, len);
4296 return ret; 4278 return ret;
4297} 4279}
4298 4280
@@ -4472,6 +4454,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
4472 } 4454 }
4473} 4455}
4474 4456
4457/*
4458 * If force is CHUNK_ALLOC_FORCE:
4459 * - return 1 if it successfully allocates a chunk,
4460 * - return errors including -ENOSPC otherwise.
4461 * If force is NOT CHUNK_ALLOC_FORCE:
4462 * - return 0 if it doesn't need to allocate a new chunk,
4463 * - return 1 if it successfully allocates a chunk,
4464 * - return errors including -ENOSPC otherwise.
4465 */
4475static int do_chunk_alloc(struct btrfs_trans_handle *trans, 4466static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4476 struct btrfs_root *extent_root, u64 flags, int force) 4467 struct btrfs_root *extent_root, u64 flags, int force)
4477{ 4468{
@@ -4882,7 +4873,7 @@ static int flush_space(struct btrfs_root *root,
4882 btrfs_get_alloc_profile(root, 0), 4873 btrfs_get_alloc_profile(root, 0),
4883 CHUNK_ALLOC_NO_FORCE); 4874 CHUNK_ALLOC_NO_FORCE);
4884 btrfs_end_transaction(trans, root); 4875 btrfs_end_transaction(trans, root);
4885 if (ret == -ENOSPC) 4876 if (ret > 0 || ret == -ENOSPC)
4886 ret = 0; 4877 ret = 0;
4887 break; 4878 break;
4888 case COMMIT_TRANS: 4879 case COMMIT_TRANS:
@@ -4907,11 +4898,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4907 u64 expected; 4898 u64 expected;
4908 u64 to_reclaim = 0; 4899 u64 to_reclaim = 0;
4909 4900
4910 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4911 if (can_overcommit(root, space_info, to_reclaim,
4912 BTRFS_RESERVE_FLUSH_ALL))
4913 return 0;
4914
4915 list_for_each_entry(ticket, &space_info->tickets, list) 4901 list_for_each_entry(ticket, &space_info->tickets, list)
4916 to_reclaim += ticket->bytes; 4902 to_reclaim += ticket->bytes;
4917 list_for_each_entry(ticket, &space_info->priority_tickets, list) 4903 list_for_each_entry(ticket, &space_info->priority_tickets, list)
@@ -4919,6 +4905,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4919 if (to_reclaim) 4905 if (to_reclaim)
4920 return to_reclaim; 4906 return to_reclaim;
4921 4907
4908 to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4909 if (can_overcommit(root, space_info, to_reclaim,
4910 BTRFS_RESERVE_FLUSH_ALL))
4911 return 0;
4912
4922 used = space_info->bytes_used + space_info->bytes_reserved + 4913 used = space_info->bytes_used + space_info->bytes_reserved +
4923 space_info->bytes_pinned + space_info->bytes_readonly + 4914 space_info->bytes_pinned + space_info->bytes_readonly +
4924 space_info->bytes_may_use; 4915 space_info->bytes_may_use;
@@ -4972,12 +4963,12 @@ static void wake_all_tickets(struct list_head *head)
4972 */ 4963 */
4973static void btrfs_async_reclaim_metadata_space(struct work_struct *work) 4964static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4974{ 4965{
4975 struct reserve_ticket *last_ticket = NULL;
4976 struct btrfs_fs_info *fs_info; 4966 struct btrfs_fs_info *fs_info;
4977 struct btrfs_space_info *space_info; 4967 struct btrfs_space_info *space_info;
4978 u64 to_reclaim; 4968 u64 to_reclaim;
4979 int flush_state; 4969 int flush_state;
4980 int commit_cycles = 0; 4970 int commit_cycles = 0;
4971 u64 last_tickets_id;
4981 4972
4982 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); 4973 fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4983 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4974 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
@@ -4990,8 +4981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4990 spin_unlock(&space_info->lock); 4981 spin_unlock(&space_info->lock);
4991 return; 4982 return;
4992 } 4983 }
4993 last_ticket = list_first_entry(&space_info->tickets, 4984 last_tickets_id = space_info->tickets_id;
4994 struct reserve_ticket, list);
4995 spin_unlock(&space_info->lock); 4985 spin_unlock(&space_info->lock);
4996 4986
4997 flush_state = FLUSH_DELAYED_ITEMS_NR; 4987 flush_state = FLUSH_DELAYED_ITEMS_NR;
@@ -5011,10 +5001,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5011 space_info); 5001 space_info);
5012 ticket = list_first_entry(&space_info->tickets, 5002 ticket = list_first_entry(&space_info->tickets,
5013 struct reserve_ticket, list); 5003 struct reserve_ticket, list);
5014 if (last_ticket == ticket) { 5004 if (last_tickets_id == space_info->tickets_id) {
5015 flush_state++; 5005 flush_state++;
5016 } else { 5006 } else {
5017 last_ticket = ticket; 5007 last_tickets_id = space_info->tickets_id;
5018 flush_state = FLUSH_DELAYED_ITEMS_NR; 5008 flush_state = FLUSH_DELAYED_ITEMS_NR;
5019 if (commit_cycles) 5009 if (commit_cycles)
5020 commit_cycles--; 5010 commit_cycles--;
@@ -5390,6 +5380,7 @@ again:
5390 list_del_init(&ticket->list); 5380 list_del_init(&ticket->list);
5391 num_bytes -= ticket->bytes; 5381 num_bytes -= ticket->bytes;
5392 ticket->bytes = 0; 5382 ticket->bytes = 0;
5383 space_info->tickets_id++;
5393 wake_up(&ticket->wait); 5384 wake_up(&ticket->wait);
5394 } else { 5385 } else {
5395 ticket->bytes -= num_bytes; 5386 ticket->bytes -= num_bytes;
@@ -5432,6 +5423,7 @@ again:
5432 num_bytes -= ticket->bytes; 5423 num_bytes -= ticket->bytes;
5433 space_info->bytes_may_use += ticket->bytes; 5424 space_info->bytes_may_use += ticket->bytes;
5434 ticket->bytes = 0; 5425 ticket->bytes = 0;
5426 space_info->tickets_id++;
5435 wake_up(&ticket->wait); 5427 wake_up(&ticket->wait);
5436 } else { 5428 } else {
5437 trace_btrfs_space_reservation(fs_info, "space_info", 5429 trace_btrfs_space_reservation(fs_info, "space_info",
@@ -6497,19 +6489,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6497} 6489}
6498 6490
6499/** 6491/**
6500 * btrfs_update_reserved_bytes - update the block_group and space info counters 6492 * btrfs_add_reserved_bytes - update the block_group and space info counters
6501 * @cache: The cache we are manipulating 6493 * @cache: The cache we are manipulating
6494 * @ram_bytes: The number of bytes of file content; this is the same as
6495 * @num_bytes except in the compression path.
6502 * @num_bytes: The number of bytes in question 6496 * @num_bytes: The number of bytes in question
6503 * @reserve: One of the reservation enums
6504 * @delalloc: The blocks are allocated for the delalloc write 6497 * @delalloc: The blocks are allocated for the delalloc write
6505 * 6498 *
6506 * This is called by the allocator when it reserves space, or by somebody who is 6499 * This is called by the allocator when it reserves space. Metadata
6507 * freeing space that was never actually used on disk. For example if you 6500 * reservations should be called with RESERVE_ALLOC so we do the proper
6508 * reserve some space for a new leaf in transaction A and before transaction A
6509 * commits you free that leaf, you call this with reserve set to 0 in order to
6510 * clear the reservation.
6511 *
6512 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
6513 * ENOSPC accounting. For data we handle the reservation through clearing the 6501 * ENOSPC accounting. For data we handle the reservation through clearing the
6514 * delalloc bits in the io_tree. We have to do this since we could end up 6502 * delalloc bits in the io_tree. We have to do this since we could end up
6515 * allocating less disk space for the amount of data we have reserved in the 6503 * allocating less disk space for the amount of data we have reserved in the
@@ -6519,44 +6507,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6519 * make the reservation and return -EAGAIN, otherwise this function always 6507 * make the reservation and return -EAGAIN, otherwise this function always
6520 * succeeds. 6508 * succeeds.
6521 */ 6509 */
6522static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 6510static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6523 u64 num_bytes, int reserve, int delalloc) 6511 u64 ram_bytes, u64 num_bytes, int delalloc)
6524{ 6512{
6525 struct btrfs_space_info *space_info = cache->space_info; 6513 struct btrfs_space_info *space_info = cache->space_info;
6526 int ret = 0; 6514 int ret = 0;
6527 6515
6528 spin_lock(&space_info->lock); 6516 spin_lock(&space_info->lock);
6529 spin_lock(&cache->lock); 6517 spin_lock(&cache->lock);
6530 if (reserve != RESERVE_FREE) { 6518 if (cache->ro) {
6531 if (cache->ro) { 6519 ret = -EAGAIN;
6532 ret = -EAGAIN;
6533 } else {
6534 cache->reserved += num_bytes;
6535 space_info->bytes_reserved += num_bytes;
6536 if (reserve == RESERVE_ALLOC) {
6537 trace_btrfs_space_reservation(cache->fs_info,
6538 "space_info", space_info->flags,
6539 num_bytes, 0);
6540 space_info->bytes_may_use -= num_bytes;
6541 }
6542
6543 if (delalloc)
6544 cache->delalloc_bytes += num_bytes;
6545 }
6546 } else { 6520 } else {
6547 if (cache->ro) 6521 cache->reserved += num_bytes;
6548 space_info->bytes_readonly += num_bytes; 6522 space_info->bytes_reserved += num_bytes;
6549 cache->reserved -= num_bytes;
6550 space_info->bytes_reserved -= num_bytes;
6551 6523
6524 trace_btrfs_space_reservation(cache->fs_info,
6525 "space_info", space_info->flags,
6526 ram_bytes, 0);
6527 space_info->bytes_may_use -= ram_bytes;
6552 if (delalloc) 6528 if (delalloc)
6553 cache->delalloc_bytes -= num_bytes; 6529 cache->delalloc_bytes += num_bytes;
6554 } 6530 }
6555 spin_unlock(&cache->lock); 6531 spin_unlock(&cache->lock);
6556 spin_unlock(&space_info->lock); 6532 spin_unlock(&space_info->lock);
6557 return ret; 6533 return ret;
6558} 6534}
6559 6535
6536/**
6537 * btrfs_free_reserved_bytes - update the block_group and space info counters
6538 * @cache: The cache we are manipulating
6539 * @num_bytes: The number of bytes in question
6540 * @delalloc: The blocks are allocated for the delalloc write
6541 *
6542 * This is called by somebody who is freeing space that was never actually used
6543 * on disk. For example if you reserve some space for a new leaf in transaction
6544 * A and before transaction A commits you free that leaf, you call this
6545 * function in order to clear the reservation.
6546 */
6547
6548static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6549 u64 num_bytes, int delalloc)
6550{
6551 struct btrfs_space_info *space_info = cache->space_info;
6552 int ret = 0;
6553
6554 spin_lock(&space_info->lock);
6555 spin_lock(&cache->lock);
6556 if (cache->ro)
6557 space_info->bytes_readonly += num_bytes;
6558 cache->reserved -= num_bytes;
6559 space_info->bytes_reserved -= num_bytes;
6560
6561 if (delalloc)
6562 cache->delalloc_bytes -= num_bytes;
6563 spin_unlock(&cache->lock);
6564 spin_unlock(&space_info->lock);
6565 return ret;
6566}
6560void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 6567void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6561 struct btrfs_root *root) 6568 struct btrfs_root *root)
6562{ 6569{
@@ -7191,7 +7198,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7191 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 7198 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7192 7199
7193 btrfs_add_free_space(cache, buf->start, buf->len); 7200 btrfs_add_free_space(cache, buf->start, buf->len);
7194 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 7201 btrfs_free_reserved_bytes(cache, buf->len, 0);
7195 btrfs_put_block_group(cache); 7202 btrfs_put_block_group(cache);
7196 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 7203 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
7197 pin = 0; 7204 pin = 0;
@@ -7416,9 +7423,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7416 * the free space extent currently. 7423 * the free space extent currently.
7417 */ 7424 */
7418static noinline int find_free_extent(struct btrfs_root *orig_root, 7425static noinline int find_free_extent(struct btrfs_root *orig_root,
7419 u64 num_bytes, u64 empty_size, 7426 u64 ram_bytes, u64 num_bytes, u64 empty_size,
7420 u64 hint_byte, struct btrfs_key *ins, 7427 u64 hint_byte, struct btrfs_key *ins,
7421 u64 flags, int delalloc) 7428 u64 flags, int delalloc)
7422{ 7429{
7423 int ret = 0; 7430 int ret = 0;
7424 struct btrfs_root *root = orig_root->fs_info->extent_root; 7431 struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -7430,8 +7437,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
7430 struct btrfs_space_info *space_info; 7437 struct btrfs_space_info *space_info;
7431 int loop = 0; 7438 int loop = 0;
7432 int index = __get_raid_index(flags); 7439 int index = __get_raid_index(flags);
7433 int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
7434 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
7435 bool failed_cluster_refill = false; 7440 bool failed_cluster_refill = false;
7436 bool failed_alloc = false; 7441 bool failed_alloc = false;
7437 bool use_cluster = true; 7442 bool use_cluster = true;
@@ -7763,8 +7768,8 @@ checks:
7763 search_start - offset); 7768 search_start - offset);
7764 BUG_ON(offset > search_start); 7769 BUG_ON(offset > search_start);
7765 7770
7766 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 7771 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7767 alloc_type, delalloc); 7772 num_bytes, delalloc);
7768 if (ret == -EAGAIN) { 7773 if (ret == -EAGAIN) {
7769 btrfs_add_free_space(block_group, offset, num_bytes); 7774 btrfs_add_free_space(block_group, offset, num_bytes);
7770 goto loop; 7775 goto loop;
@@ -7936,7 +7941,7 @@ again:
7936 up_read(&info->groups_sem); 7941 up_read(&info->groups_sem);
7937} 7942}
7938 7943
7939int btrfs_reserve_extent(struct btrfs_root *root, 7944int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
7940 u64 num_bytes, u64 min_alloc_size, 7945 u64 num_bytes, u64 min_alloc_size,
7941 u64 empty_size, u64 hint_byte, 7946 u64 empty_size, u64 hint_byte,
7942 struct btrfs_key *ins, int is_data, int delalloc) 7947 struct btrfs_key *ins, int is_data, int delalloc)
@@ -7948,8 +7953,8 @@ int btrfs_reserve_extent(struct btrfs_root *root,
7948 flags = btrfs_get_alloc_profile(root, is_data); 7953 flags = btrfs_get_alloc_profile(root, is_data);
7949again: 7954again:
7950 WARN_ON(num_bytes < root->sectorsize); 7955 WARN_ON(num_bytes < root->sectorsize);
7951 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 7956 ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
7952 flags, delalloc); 7957 hint_byte, ins, flags, delalloc);
7953 if (!ret && !is_data) { 7958 if (!ret && !is_data) {
7954 btrfs_dec_block_group_reservations(root->fs_info, 7959 btrfs_dec_block_group_reservations(root->fs_info,
7955 ins->objectid); 7960 ins->objectid);
@@ -7958,6 +7963,7 @@ again:
7958 num_bytes = min(num_bytes >> 1, ins->offset); 7963 num_bytes = min(num_bytes >> 1, ins->offset);
7959 num_bytes = round_down(num_bytes, root->sectorsize); 7964 num_bytes = round_down(num_bytes, root->sectorsize);
7960 num_bytes = max(num_bytes, min_alloc_size); 7965 num_bytes = max(num_bytes, min_alloc_size);
7966 ram_bytes = num_bytes;
7961 if (num_bytes == min_alloc_size) 7967 if (num_bytes == min_alloc_size)
7962 final_tried = true; 7968 final_tried = true;
7963 goto again; 7969 goto again;
@@ -7995,7 +8001,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
7995 if (btrfs_test_opt(root->fs_info, DISCARD)) 8001 if (btrfs_test_opt(root->fs_info, DISCARD))
7996 ret = btrfs_discard_extent(root, start, len, NULL); 8002 ret = btrfs_discard_extent(root, start, len, NULL);
7997 btrfs_add_free_space(cache, start, len); 8003 btrfs_add_free_space(cache, start, len);
7998 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 8004 btrfs_free_reserved_bytes(cache, len, delalloc);
7999 trace_btrfs_reserved_extent_free(root, start, len); 8005 trace_btrfs_reserved_extent_free(root, start, len);
8000 } 8006 }
8001 8007
@@ -8208,6 +8214,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8208{ 8214{
8209 int ret; 8215 int ret;
8210 struct btrfs_block_group_cache *block_group; 8216 struct btrfs_block_group_cache *block_group;
8217 struct btrfs_space_info *space_info;
8211 8218
8212 /* 8219 /*
8213 * Mixed block groups will exclude before processing the log so we only 8220 * Mixed block groups will exclude before processing the log so we only
@@ -8223,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8223 if (!block_group) 8230 if (!block_group)
8224 return -EINVAL; 8231 return -EINVAL;
8225 8232
8226 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 8233 space_info = block_group->space_info;
8227 RESERVE_ALLOC_NO_ACCOUNT, 0); 8234 spin_lock(&space_info->lock);
8228 BUG_ON(ret); /* logic error */ 8235 spin_lock(&block_group->lock);
8236 space_info->bytes_reserved += ins->offset;
8237 block_group->reserved += ins->offset;
8238 spin_unlock(&block_group->lock);
8239 spin_unlock(&space_info->lock);
8240
8229 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 8241 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
8230 0, owner, offset, ins, 1); 8242 0, owner, offset, ins, 1);
8231 btrfs_put_block_group(block_group); 8243 btrfs_put_block_group(block_group);
@@ -8368,7 +8380,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8368 if (IS_ERR(block_rsv)) 8380 if (IS_ERR(block_rsv))
8369 return ERR_CAST(block_rsv); 8381 return ERR_CAST(block_rsv);
8370 8382
8371 ret = btrfs_reserve_extent(root, blocksize, blocksize, 8383 ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8372 empty_size, hint, &ins, 0, 0); 8384 empty_size, hint, &ins, 0, 0);
8373 if (ret) 8385 if (ret)
8374 goto out_unuse; 8386 goto out_unuse;
@@ -8521,35 +8533,6 @@ reada:
8521 wc->reada_slot = slot; 8533 wc->reada_slot = slot;
8522} 8534}
8523 8535
8524/*
8525 * These may not be seen by the usual inc/dec ref code so we have to
8526 * add them here.
8527 */
8528static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
8529 struct btrfs_root *root, u64 bytenr,
8530 u64 num_bytes)
8531{
8532 struct btrfs_qgroup_extent_record *qrecord;
8533 struct btrfs_delayed_ref_root *delayed_refs;
8534
8535 qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
8536 if (!qrecord)
8537 return -ENOMEM;
8538
8539 qrecord->bytenr = bytenr;
8540 qrecord->num_bytes = num_bytes;
8541 qrecord->old_roots = NULL;
8542
8543 delayed_refs = &trans->transaction->delayed_refs;
8544 spin_lock(&delayed_refs->lock);
8545 if (btrfs_qgroup_insert_dirty_extent(trans->fs_info,
8546 delayed_refs, qrecord))
8547 kfree(qrecord);
8548 spin_unlock(&delayed_refs->lock);
8549
8550 return 0;
8551}
8552
8553static int account_leaf_items(struct btrfs_trans_handle *trans, 8536static int account_leaf_items(struct btrfs_trans_handle *trans,
8554 struct btrfs_root *root, 8537 struct btrfs_root *root,
8555 struct extent_buffer *eb) 8538 struct extent_buffer *eb)
@@ -8583,7 +8566,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
8583 8566
8584 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 8567 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
8585 8568
8586 ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); 8569 ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
8570 bytenr, num_bytes, GFP_NOFS);
8587 if (ret) 8571 if (ret)
8588 return ret; 8572 return ret;
8589 } 8573 }
@@ -8732,8 +8716,9 @@ walk_down:
8732 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); 8716 btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
8733 path->locks[level] = BTRFS_READ_LOCK_BLOCKING; 8717 path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
8734 8718
8735 ret = record_one_subtree_extent(trans, root, child_bytenr, 8719 ret = btrfs_qgroup_insert_dirty_extent(trans,
8736 root->nodesize); 8720 root->fs_info, child_bytenr,
8721 root->nodesize, GFP_NOFS);
8737 if (ret) 8722 if (ret)
8738 goto out; 8723 goto out;
8739 } 8724 }
@@ -9906,6 +9891,7 @@ static int find_first_block_group(struct btrfs_root *root,
9906 } else { 9891 } else {
9907 ret = 0; 9892 ret = 0;
9908 } 9893 }
9894 free_extent_map(em);
9909 goto out; 9895 goto out;
9910 } 9896 }
9911 path->slots[0]++; 9897 path->slots[0]++;
@@ -9942,6 +9928,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9942 block_group->iref = 0; 9928 block_group->iref = 0;
9943 block_group->inode = NULL; 9929 block_group->inode = NULL;
9944 spin_unlock(&block_group->lock); 9930 spin_unlock(&block_group->lock);
9931 ASSERT(block_group->io_ctl.inode == NULL);
9945 iput(inode); 9932 iput(inode);
9946 last = block_group->key.objectid + block_group->key.offset; 9933 last = block_group->key.objectid + block_group->key.offset;
9947 btrfs_put_block_group(block_group); 9934 btrfs_put_block_group(block_group);
@@ -9999,6 +9986,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
9999 free_excluded_extents(info->extent_root, block_group); 9986 free_excluded_extents(info->extent_root, block_group);
10000 9987
10001 btrfs_remove_free_space_cache(block_group); 9988 btrfs_remove_free_space_cache(block_group);
9989 ASSERT(list_empty(&block_group->dirty_list));
9990 ASSERT(list_empty(&block_group->io_list));
9991 ASSERT(list_empty(&block_group->bg_list));
9992 ASSERT(atomic_read(&block_group->count) == 1);
10002 btrfs_put_block_group(block_group); 9993 btrfs_put_block_group(block_group);
10003 9994
10004 spin_lock(&info->block_group_cache_lock); 9995 spin_lock(&info->block_group_cache_lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bc2729a7612d..28cd88fccc7e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,6 +20,7 @@
20#define EXTENT_DAMAGED (1U << 14) 20#define EXTENT_DAMAGED (1U << 14)
21#define EXTENT_NORESERVE (1U << 15) 21#define EXTENT_NORESERVE (1U << 15)
22#define EXTENT_QGROUP_RESERVED (1U << 16) 22#define EXTENT_QGROUP_RESERVED (1U << 16)
23#define EXTENT_CLEAR_DATA_RESV (1U << 17)
23#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 24#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
24#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 25#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
25 26
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9404121fd5f7..fea31a4a6e36 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2033,6 +2033,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2033 */ 2033 */
2034 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 2034 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2035 &BTRFS_I(inode)->runtime_flags); 2035 &BTRFS_I(inode)->runtime_flags);
2036 /*
2037 * An ordered extent might have started before and completed
2038 * already with io errors, in which case the inode was not
2039 * updated and we end up here. So check the inode's mapping
2040 * flags for any errors that might have happened while doing
2041 * writeback of file data.
2042 */
2043 ret = btrfs_inode_check_errors(inode);
2036 inode_unlock(inode); 2044 inode_unlock(inode);
2037 goto out; 2045 goto out;
2038 } 2046 }
@@ -2062,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2062 } 2070 }
2063 trans->sync = true; 2071 trans->sync = true;
2064 2072
2065 btrfs_init_log_ctx(&ctx); 2073 btrfs_init_log_ctx(&ctx, inode);
2066 2074
2067 ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); 2075 ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
2068 if (ret < 0) { 2076 if (ret < 0) {
@@ -2667,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode,
2667 2675
2668 alloc_start = round_down(offset, blocksize); 2676 alloc_start = round_down(offset, blocksize);
2669 alloc_end = round_up(offset + len, blocksize); 2677 alloc_end = round_up(offset + len, blocksize);
2678 cur_offset = alloc_start;
2670 2679
2671 /* Make sure we aren't being given some crap mode */ 2680 /* Make sure we aren't being given some crap mode */
2672 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2681 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2759,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2759 2768
2760 /* First, check if we exceed the qgroup limit */ 2769 /* First, check if we exceed the qgroup limit */
2761 INIT_LIST_HEAD(&reserve_list); 2770 INIT_LIST_HEAD(&reserve_list);
2762 cur_offset = alloc_start;
2763 while (1) { 2771 while (1) {
2764 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 2772 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2765 alloc_end - cur_offset, 0); 2773 alloc_end - cur_offset, 0);
@@ -2786,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode,
2786 last_byte - cur_offset); 2794 last_byte - cur_offset);
2787 if (ret < 0) 2795 if (ret < 0)
2788 break; 2796 break;
2797 } else {
2798 /*
2799 * We do not need to reserve an unwritten extent for this
2800 * range, so free the reserved data space first; otherwise
2801 * it will result in a false ENOSPC error.
2802 */
2803 btrfs_free_reserved_data_space(inode, cur_offset,
2804 last_byte - cur_offset);
2789 } 2805 }
2790 free_extent_map(em); 2806 free_extent_map(em);
2791 cur_offset = last_byte; 2807 cur_offset = last_byte;
@@ -2803,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode,
2803 range->start, 2819 range->start,
2804 range->len, 1 << inode->i_blkbits, 2820 range->len, 1 << inode->i_blkbits,
2805 offset + len, &alloc_hint); 2821 offset + len, &alloc_hint);
2822 else
2823 btrfs_free_reserved_data_space(inode, range->start,
2824 range->len);
2806 list_del(&range->list); 2825 list_del(&range->list);
2807 kfree(range); 2826 kfree(range);
2808 } 2827 }
@@ -2837,18 +2856,11 @@ out_unlock:
2837 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2856 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2838 &cached_state, GFP_KERNEL); 2857 &cached_state, GFP_KERNEL);
2839out: 2858out:
2840 /*
2841 * As we waited the extent range, the data_rsv_map must be empty
2842 * in the range, as written data range will be released from it.
2843 * And for prealloacted extent, it will also be released when
2844 * its metadata is written.
2845 * So this is completely used as cleanup.
2846 */
2847 btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
2848 inode_unlock(inode); 2859 inode_unlock(inode);
2849 /* Let go of our reservation. */ 2860 /* Let go of our reservation. */
2850 btrfs_free_reserved_data_space(inode, alloc_start, 2861 if (ret != 0)
2851 alloc_end - alloc_start); 2862 btrfs_free_reserved_data_space(inode, alloc_start,
2863 alloc_end - cur_offset);
2852 return ret; 2864 return ret;
2853} 2865}
2854 2866
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index aa6fabaee72e..359ee861b5a4 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -495,10 +495,9 @@ again:
495 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 495 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
496 prealloc, prealloc, &alloc_hint); 496 prealloc, prealloc, &alloc_hint);
497 if (ret) { 497 if (ret) {
498 btrfs_delalloc_release_space(inode, 0, prealloc); 498 btrfs_delalloc_release_metadata(inode, prealloc);
499 goto out_put; 499 goto out_put;
500 } 500 }
501 btrfs_free_reserved_data_space(inode, 0, prealloc);
502 501
503 ret = btrfs_write_out_ino_cache(root, trans, path, inode); 502 ret = btrfs_write_out_ino_cache(root, trans, path, inode);
504out_put: 503out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2f5975954ccf..e6811c42e41e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -566,6 +566,8 @@ cont:
566 PAGE_SET_WRITEBACK | 566 PAGE_SET_WRITEBACK |
567 page_error_op | 567 page_error_op |
568 PAGE_END_WRITEBACK); 568 PAGE_END_WRITEBACK);
569 btrfs_free_reserved_data_space_noquota(inode, start,
570 end - start + 1);
569 goto free_pages_out; 571 goto free_pages_out;
570 } 572 }
571 } 573 }
@@ -742,7 +744,7 @@ retry:
742 lock_extent(io_tree, async_extent->start, 744 lock_extent(io_tree, async_extent->start,
743 async_extent->start + async_extent->ram_size - 1); 745 async_extent->start + async_extent->ram_size - 1);
744 746
745 ret = btrfs_reserve_extent(root, 747 ret = btrfs_reserve_extent(root, async_extent->ram_size,
746 async_extent->compressed_size, 748 async_extent->compressed_size,
747 async_extent->compressed_size, 749 async_extent->compressed_size,
748 0, alloc_hint, &ins, 1, 1); 750 0, alloc_hint, &ins, 1, 1);
@@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode,
969 EXTENT_DEFRAG, PAGE_UNLOCK | 971 EXTENT_DEFRAG, PAGE_UNLOCK |
970 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | 972 PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
971 PAGE_END_WRITEBACK); 973 PAGE_END_WRITEBACK);
972 974 btrfs_free_reserved_data_space_noquota(inode, start,
975 end - start + 1);
973 *nr_written = *nr_written + 976 *nr_written = *nr_written +
974 (end - start + PAGE_SIZE) / PAGE_SIZE; 977 (end - start + PAGE_SIZE) / PAGE_SIZE;
975 *page_started = 1; 978 *page_started = 1;
@@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode,
989 unsigned long op; 992 unsigned long op;
990 993
991 cur_alloc_size = disk_num_bytes; 994 cur_alloc_size = disk_num_bytes;
992 ret = btrfs_reserve_extent(root, cur_alloc_size, 995 ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
993 root->sectorsize, 0, alloc_hint, 996 root->sectorsize, 0, alloc_hint,
994 &ins, 1, 1); 997 &ins, 1, 1);
995 if (ret < 0) 998 if (ret < 0)
@@ -1489,8 +1492,10 @@ out_check:
1489 extent_clear_unlock_delalloc(inode, cur_offset, 1492 extent_clear_unlock_delalloc(inode, cur_offset,
1490 cur_offset + num_bytes - 1, 1493 cur_offset + num_bytes - 1,
1491 locked_page, EXTENT_LOCKED | 1494 locked_page, EXTENT_LOCKED |
1492 EXTENT_DELALLOC, PAGE_UNLOCK | 1495 EXTENT_DELALLOC |
1493 PAGE_SET_PRIVATE2); 1496 EXTENT_CLEAR_DATA_RESV,
1497 PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1498
1494 if (!nolock && nocow) 1499 if (!nolock && nocow)
1495 btrfs_end_write_no_snapshoting(root); 1500 btrfs_end_write_no_snapshoting(root);
1496 cur_offset = extent_end; 1501 cur_offset = extent_end;
@@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1807 return; 1812 return;
1808 1813
1809 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1814 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1810 && do_list && !(state->state & EXTENT_NORESERVE)) 1815 && do_list && !(state->state & EXTENT_NORESERVE)
1816 && (*bits & (EXTENT_DO_ACCOUNTING |
1817 EXTENT_CLEAR_DATA_RESV)))
1811 btrfs_free_reserved_data_space_noquota(inode, 1818 btrfs_free_reserved_data_space_noquota(inode,
1812 state->start, len); 1819 state->start, len);
1813 1820
@@ -3435,10 +3442,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3435 found_key.offset = 0; 3442 found_key.offset = 0;
3436 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 3443 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3437 ret = PTR_ERR_OR_ZERO(inode); 3444 ret = PTR_ERR_OR_ZERO(inode);
3438 if (ret && ret != -ESTALE) 3445 if (ret && ret != -ENOENT)
3439 goto out; 3446 goto out;
3440 3447
3441 if (ret == -ESTALE && root == root->fs_info->tree_root) { 3448 if (ret == -ENOENT && root == root->fs_info->tree_root) {
3442 struct btrfs_root *dead_root; 3449 struct btrfs_root *dead_root;
3443 struct btrfs_fs_info *fs_info = root->fs_info; 3450 struct btrfs_fs_info *fs_info = root->fs_info;
3444 int is_dead_root = 0; 3451 int is_dead_root = 0;
@@ -3474,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3474 * Inode is already gone but the orphan item is still there, 3481 * Inode is already gone but the orphan item is still there,
3475 * kill the orphan item. 3482 * kill the orphan item.
3476 */ 3483 */
3477 if (ret == -ESTALE) { 3484 if (ret == -ENOENT) {
3478 trans = btrfs_start_transaction(root, 1); 3485 trans = btrfs_start_transaction(root, 1);
3479 if (IS_ERR(trans)) { 3486 if (IS_ERR(trans)) {
3480 ret = PTR_ERR(trans); 3487 ret = PTR_ERR(trans);
@@ -3633,7 +3640,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3633/* 3640/*
3634 * read an inode from the btree into the in-memory inode 3641 * read an inode from the btree into the in-memory inode
3635 */ 3642 */
3636static void btrfs_read_locked_inode(struct inode *inode) 3643static int btrfs_read_locked_inode(struct inode *inode)
3637{ 3644{
3638 struct btrfs_path *path; 3645 struct btrfs_path *path;
3639 struct extent_buffer *leaf; 3646 struct extent_buffer *leaf;
@@ -3652,14 +3659,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
3652 filled = true; 3659 filled = true;
3653 3660
3654 path = btrfs_alloc_path(); 3661 path = btrfs_alloc_path();
3655 if (!path) 3662 if (!path) {
3663 ret = -ENOMEM;
3656 goto make_bad; 3664 goto make_bad;
3665 }
3657 3666
3658 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 3667 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3659 3668
3660 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 3669 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3661 if (ret) 3670 if (ret) {
3671 if (ret > 0)
3672 ret = -ENOENT;
3662 goto make_bad; 3673 goto make_bad;
3674 }
3663 3675
3664 leaf = path->nodes[0]; 3676 leaf = path->nodes[0];
3665 3677
@@ -3812,11 +3824,12 @@ cache_acl:
3812 } 3824 }
3813 3825
3814 btrfs_update_iflags(inode); 3826 btrfs_update_iflags(inode);
3815 return; 3827 return 0;
3816 3828
3817make_bad: 3829make_bad:
3818 btrfs_free_path(path); 3830 btrfs_free_path(path);
3819 make_bad_inode(inode); 3831 make_bad_inode(inode);
3832 return ret;
3820} 3833}
3821 3834
3822/* 3835/*
@@ -4204,6 +4217,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4204 int err = 0; 4217 int err = 0;
4205 struct btrfs_root *root = BTRFS_I(dir)->root; 4218 struct btrfs_root *root = BTRFS_I(dir)->root;
4206 struct btrfs_trans_handle *trans; 4219 struct btrfs_trans_handle *trans;
4220 u64 last_unlink_trans;
4207 4221
4208 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) 4222 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4209 return -ENOTEMPTY; 4223 return -ENOTEMPTY;
@@ -4226,11 +4240,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4226 if (err) 4240 if (err)
4227 goto out; 4241 goto out;
4228 4242
4243 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4244
4229 /* now the directory is empty */ 4245 /* now the directory is empty */
4230 err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), 4246 err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
4231 dentry->d_name.name, dentry->d_name.len); 4247 dentry->d_name.name, dentry->d_name.len);
4232 if (!err) 4248 if (!err) {
4233 btrfs_i_size_write(inode, 0); 4249 btrfs_i_size_write(inode, 0);
4250 /*
4251 * Propagate the last_unlink_trans value of the deleted dir to
4252 * its parent directory. This is to prevent an unrecoverable
4253 * log tree in the case we do something like this:
4254 * 1) create dir foo
4255 * 2) create snapshot under dir foo
4256 * 3) delete the snapshot
4257 * 4) rmdir foo
4258 * 5) mkdir foo
4259 * 6) fsync foo or some file inside foo
4260 */
4261 if (last_unlink_trans >= trans->transid)
4262 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4263 }
4234out: 4264out:
4235 btrfs_end_transaction(trans, root); 4265 btrfs_end_transaction(trans, root);
4236 btrfs_btree_balance_dirty(root); 4266 btrfs_btree_balance_dirty(root);
@@ -5606,7 +5636,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5606 return ERR_PTR(-ENOMEM); 5636 return ERR_PTR(-ENOMEM);
5607 5637
5608 if (inode->i_state & I_NEW) { 5638 if (inode->i_state & I_NEW) {
5609 btrfs_read_locked_inode(inode); 5639 int ret;
5640
5641 ret = btrfs_read_locked_inode(inode);
5610 if (!is_bad_inode(inode)) { 5642 if (!is_bad_inode(inode)) {
5611 inode_tree_add(inode); 5643 inode_tree_add(inode);
5612 unlock_new_inode(inode); 5644 unlock_new_inode(inode);
@@ -5615,7 +5647,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5615 } else { 5647 } else {
5616 unlock_new_inode(inode); 5648 unlock_new_inode(inode);
5617 iput(inode); 5649 iput(inode);
5618 inode = ERR_PTR(-ESTALE); 5650 ASSERT(ret < 0);
5651 inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
5619 } 5652 }
5620 } 5653 }
5621 5654
@@ -7225,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7225 int ret; 7258 int ret;
7226 7259
7227 alloc_hint = get_extent_allocation_hint(inode, start, len); 7260 alloc_hint = get_extent_allocation_hint(inode, start, len);
7228 ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, 7261 ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
7229 alloc_hint, &ins, 1, 1); 7262 alloc_hint, &ins, 1, 1);
7230 if (ret) 7263 if (ret)
7231 return ERR_PTR(ret); 7264 return ERR_PTR(ret);
@@ -7725,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7725 ret = PTR_ERR(em2); 7758 ret = PTR_ERR(em2);
7726 goto unlock_err; 7759 goto unlock_err;
7727 } 7760 }
7761 /*
7762 * For an inode marked NODATACOW or an extent marked PREALLOC,
7763 * we use the existing or preallocated extent, so there is no
7764 * need to adjust btrfs_space_info's bytes_may_use.
7765 */
7766 btrfs_free_reserved_data_space_noquota(inode,
7767 start, len);
7728 goto unlock; 7768 goto unlock;
7729 } 7769 }
7730 } 7770 }
@@ -7759,7 +7799,6 @@ unlock:
7759 i_size_write(inode, start + len); 7799 i_size_write(inode, start + len);
7760 7800
7761 adjust_dio_outstanding_extents(inode, dio_data, len); 7801 adjust_dio_outstanding_extents(inode, dio_data, len);
7762 btrfs_free_reserved_data_space(inode, start, len);
7763 WARN_ON(dio_data->reserve < len); 7802 WARN_ON(dio_data->reserve < len);
7764 dio_data->reserve -= len; 7803 dio_data->reserve -= len;
7765 dio_data->unsubmitted_oe_range_end = start + len; 7804 dio_data->unsubmitted_oe_range_end = start + len;
@@ -10280,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10280 u64 last_alloc = (u64)-1; 10319 u64 last_alloc = (u64)-1;
10281 int ret = 0; 10320 int ret = 0;
10282 bool own_trans = true; 10321 bool own_trans = true;
10322 u64 end = start + num_bytes - 1;
10283 10323
10284 if (trans) 10324 if (trans)
10285 own_trans = false; 10325 own_trans = false;
@@ -10301,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10301 * sized chunks. 10341 * sized chunks.
10302 */ 10342 */
10303 cur_bytes = min(cur_bytes, last_alloc); 10343 cur_bytes = min(cur_bytes, last_alloc);
10304 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, 10344 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
10305 *alloc_hint, &ins, 1, 0); 10345 min_size, 0, *alloc_hint, &ins, 1, 0);
10306 if (ret) { 10346 if (ret) {
10307 if (own_trans) 10347 if (own_trans)
10308 btrfs_end_transaction(trans, root); 10348 btrfs_end_transaction(trans, root);
@@ -10388,6 +10428,9 @@ next:
10388 if (own_trans) 10428 if (own_trans)
10389 btrfs_end_transaction(trans, root); 10429 btrfs_end_transaction(trans, root);
10390 } 10430 }
10431 if (cur_offset < end)
10432 btrfs_free_reserved_data_space(inode, cur_offset,
10433 end - cur_offset + 1);
10391 return ret; 10434 return ret;
10392} 10435}
10393 10436
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14ed1e9e6bc8..7fd939bfbd99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1634 int namelen; 1634 int namelen;
1635 int ret = 0; 1635 int ret = 0;
1636 1636
1637 if (!S_ISDIR(file_inode(file)->i_mode))
1638 return -ENOTDIR;
1639
1637 ret = mnt_want_write_file(file); 1640 ret = mnt_want_write_file(file);
1638 if (ret) 1641 if (ret)
1639 goto out; 1642 goto out;
@@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
1691 struct btrfs_ioctl_vol_args *vol_args; 1694 struct btrfs_ioctl_vol_args *vol_args;
1692 int ret; 1695 int ret;
1693 1696
1697 if (!S_ISDIR(file_inode(file)->i_mode))
1698 return -ENOTDIR;
1699
1694 vol_args = memdup_user(arg, sizeof(*vol_args)); 1700 vol_args = memdup_user(arg, sizeof(*vol_args));
1695 if (IS_ERR(vol_args)) 1701 if (IS_ERR(vol_args))
1696 return PTR_ERR(vol_args); 1702 return PTR_ERR(vol_args);
@@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1714 bool readonly = false; 1720 bool readonly = false;
1715 struct btrfs_qgroup_inherit *inherit = NULL; 1721 struct btrfs_qgroup_inherit *inherit = NULL;
1716 1722
1723 if (!S_ISDIR(file_inode(file)->i_mode))
1724 return -ENOTDIR;
1725
1717 vol_args = memdup_user(arg, sizeof(*vol_args)); 1726 vol_args = memdup_user(arg, sizeof(*vol_args));
1718 if (IS_ERR(vol_args)) 1727 if (IS_ERR(vol_args))
1719 return PTR_ERR(vol_args); 1728 return PTR_ERR(vol_args);
@@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2357 int ret; 2366 int ret;
2358 int err = 0; 2367 int err = 0;
2359 2368
2369 if (!S_ISDIR(dir->i_mode))
2370 return -ENOTDIR;
2371
2360 vol_args = memdup_user(arg, sizeof(*vol_args)); 2372 vol_args = memdup_user(arg, sizeof(*vol_args));
2361 if (IS_ERR(vol_args)) 2373 if (IS_ERR(vol_args))
2362 return PTR_ERR(vol_args); 2374 return PTR_ERR(vol_args);
@@ -5084,7 +5096,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
5084 if (!capable(CAP_SYS_ADMIN)) 5096 if (!capable(CAP_SYS_ADMIN))
5085 return -EPERM; 5097 return -EPERM;
5086 5098
5087 return btrfs_qgroup_wait_for_completion(root->fs_info); 5099 return btrfs_qgroup_wait_for_completion(root->fs_info, true);
5088} 5100}
5089 5101
5090static long _btrfs_ioctl_set_received_subvol(struct file *file, 5102static long _btrfs_ioctl_set_received_subvol(struct file *file,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 93ee1c18ef9d..8db2e29fdcf4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
995 goto out; 995 goto out;
996 fs_info->quota_enabled = 0; 996 fs_info->quota_enabled = 0;
997 fs_info->pending_quota_state = 0; 997 fs_info->pending_quota_state = 0;
998 btrfs_qgroup_wait_for_completion(fs_info); 998 btrfs_qgroup_wait_for_completion(fs_info, false);
999 spin_lock(&fs_info->qgroup_lock); 999 spin_lock(&fs_info->qgroup_lock);
1000 quota_root = fs_info->quota_root; 1000 quota_root = fs_info->quota_root;
1001 fs_info->quota_root = NULL; 1001 fs_info->quota_root = NULL;
@@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
1453 return ret; 1453 return ret;
1454} 1454}
1455 1455
1456struct btrfs_qgroup_extent_record * 1456int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
1457btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, 1457 struct btrfs_delayed_ref_root *delayed_refs,
1458 struct btrfs_delayed_ref_root *delayed_refs, 1458 struct btrfs_qgroup_extent_record *record)
1459 struct btrfs_qgroup_extent_record *record)
1460{ 1459{
1461 struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1460 struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
1462 struct rb_node *parent_node = NULL; 1461 struct rb_node *parent_node = NULL;
@@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
1475 else if (bytenr > entry->bytenr) 1474 else if (bytenr > entry->bytenr)
1476 p = &(*p)->rb_right; 1475 p = &(*p)->rb_right;
1477 else 1476 else
1478 return entry; 1477 return 1;
1479 } 1478 }
1480 1479
1481 rb_link_node(&record->node, parent_node, p); 1480 rb_link_node(&record->node, parent_node, p);
1482 rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1481 rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
1483 return NULL; 1482 return 0;
1483}
1484
1485int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
1486 struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
1487 gfp_t gfp_flag)
1488{
1489 struct btrfs_qgroup_extent_record *record;
1490 struct btrfs_delayed_ref_root *delayed_refs;
1491 int ret;
1492
1493 if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0)
1494 return 0;
1495 if (WARN_ON(trans == NULL))
1496 return -EINVAL;
1497 record = kmalloc(sizeof(*record), gfp_flag);
1498 if (!record)
1499 return -ENOMEM;
1500
1501 delayed_refs = &trans->transaction->delayed_refs;
1502 record->bytenr = bytenr;
1503 record->num_bytes = num_bytes;
1504 record->old_roots = NULL;
1505
1506 spin_lock(&delayed_refs->lock);
1507 ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs,
1508 record);
1509 spin_unlock(&delayed_refs->lock);
1510 if (ret > 0)
1511 kfree(record);
1512 return 0;
1484} 1513}
1485 1514
1486#define UPDATE_NEW 0 1515#define UPDATE_NEW 0
@@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2303 int err = -ENOMEM; 2332 int err = -ENOMEM;
2304 int ret = 0; 2333 int ret = 0;
2305 2334
2335 mutex_lock(&fs_info->qgroup_rescan_lock);
2336 fs_info->qgroup_rescan_running = true;
2337 mutex_unlock(&fs_info->qgroup_rescan_lock);
2338
2306 path = btrfs_alloc_path(); 2339 path = btrfs_alloc_path();
2307 if (!path) 2340 if (!path)
2308 goto out; 2341 goto out;
@@ -2369,6 +2402,9 @@ out:
2369 } 2402 }
2370 2403
2371done: 2404done:
2405 mutex_lock(&fs_info->qgroup_rescan_lock);
2406 fs_info->qgroup_rescan_running = false;
2407 mutex_unlock(&fs_info->qgroup_rescan_lock);
2372 complete_all(&fs_info->qgroup_rescan_completion); 2408 complete_all(&fs_info->qgroup_rescan_completion);
2373} 2409}
2374 2410
@@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
2487 return 0; 2523 return 0;
2488} 2524}
2489 2525
2490int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) 2526int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
2527 bool interruptible)
2491{ 2528{
2492 int running; 2529 int running;
2493 int ret = 0; 2530 int ret = 0;
2494 2531
2495 mutex_lock(&fs_info->qgroup_rescan_lock); 2532 mutex_lock(&fs_info->qgroup_rescan_lock);
2496 spin_lock(&fs_info->qgroup_lock); 2533 spin_lock(&fs_info->qgroup_lock);
2497 running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2534 running = fs_info->qgroup_rescan_running;
2498 spin_unlock(&fs_info->qgroup_lock); 2535 spin_unlock(&fs_info->qgroup_lock);
2499 mutex_unlock(&fs_info->qgroup_rescan_lock); 2536 mutex_unlock(&fs_info->qgroup_rescan_lock);
2500 2537
2501 if (running) 2538 if (!running)
2539 return 0;
2540
2541 if (interruptible)
2502 ret = wait_for_completion_interruptible( 2542 ret = wait_for_completion_interruptible(
2503 &fs_info->qgroup_rescan_completion); 2543 &fs_info->qgroup_rescan_completion);
2544 else
2545 wait_for_completion(&fs_info->qgroup_rescan_completion);
2504 2546
2505 return ret; 2547 return ret;
2506} 2548}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 710887c06aaf..1bc64c864b62 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
46 struct btrfs_fs_info *fs_info); 46 struct btrfs_fs_info *fs_info);
47int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); 47int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
48void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); 48void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
49int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); 49int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
50 bool interruptible);
50int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, 51int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
51 struct btrfs_fs_info *fs_info, u64 src, u64 dst); 52 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
52int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, 53int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
@@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
63struct btrfs_delayed_extent_op; 64struct btrfs_delayed_extent_op;
64int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, 65int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
65 struct btrfs_fs_info *fs_info); 66 struct btrfs_fs_info *fs_info);
66struct btrfs_qgroup_extent_record * 67/*
67btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info, 68 * Insert one dirty extent record into @delayed_refs, informing qgroup to
68 struct btrfs_delayed_ref_root *delayed_refs, 69 * account that extent at commit trans time.
69 struct btrfs_qgroup_extent_record *record); 70 *
71 * No lock version, caller must acquire delayed ref lock and allocate memory.
72 *
73 * Return 0 for success insert
74 * Return >0 for existing record, caller can free @record safely.
75 * Error is not possible
76 */
77int btrfs_qgroup_insert_dirty_extent_nolock(
78 struct btrfs_fs_info *fs_info,
79 struct btrfs_delayed_ref_root *delayed_refs,
80 struct btrfs_qgroup_extent_record *record);
81
82/*
83 * Insert one dirty extent record into @delayed_refs, informing qgroup to
84 * account that extent at commit trans time.
85 *
86 * Better encapsulated version.
87 *
88 * Return 0 if the operation is done.
89 * Return <0 for error, like memory allocation failure or invalid parameter
90 * (NULL trans)
91 */
92int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
93 struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
94 gfp_t gfp_flag);
95
70int 96int
71btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, 97btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
72 struct btrfs_fs_info *fs_info, 98 struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b26a5aea41b4..c0c13dc6fe12 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -31,6 +31,7 @@
31#include "async-thread.h" 31#include "async-thread.h"
32#include "free-space-cache.h" 32#include "free-space-cache.h"
33#include "inode-map.h" 33#include "inode-map.h"
34#include "qgroup.h"
34 35
35/* 36/*
36 * backref_node, mapping_node and tree_block start with this 37 * backref_node, mapping_node and tree_block start with this
@@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode,
3037 u64 num_bytes; 3038 u64 num_bytes;
3038 int nr = 0; 3039 int nr = 0;
3039 int ret = 0; 3040 int ret = 0;
3041 u64 prealloc_start = cluster->start - offset;
3042 u64 prealloc_end = cluster->end - offset;
3043 u64 cur_offset;
3040 3044
3041 BUG_ON(cluster->start != cluster->boundary[0]); 3045 BUG_ON(cluster->start != cluster->boundary[0]);
3042 inode_lock(inode); 3046 inode_lock(inode);
3043 3047
3044 ret = btrfs_check_data_free_space(inode, cluster->start, 3048 ret = btrfs_check_data_free_space(inode, prealloc_start,
3045 cluster->end + 1 - cluster->start); 3049 prealloc_end + 1 - prealloc_start);
3046 if (ret) 3050 if (ret)
3047 goto out; 3051 goto out;
3048 3052
3053 cur_offset = prealloc_start;
3049 while (nr < cluster->nr) { 3054 while (nr < cluster->nr) {
3050 start = cluster->boundary[nr] - offset; 3055 start = cluster->boundary[nr] - offset;
3051 if (nr + 1 < cluster->nr) 3056 if (nr + 1 < cluster->nr)
@@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode,
3055 3060
3056 lock_extent(&BTRFS_I(inode)->io_tree, start, end); 3061 lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3057 num_bytes = end + 1 - start; 3062 num_bytes = end + 1 - start;
3063 if (cur_offset < start)
3064 btrfs_free_reserved_data_space(inode, cur_offset,
3065 start - cur_offset);
3058 ret = btrfs_prealloc_file_range(inode, 0, start, 3066 ret = btrfs_prealloc_file_range(inode, 0, start,
3059 num_bytes, num_bytes, 3067 num_bytes, num_bytes,
3060 end + 1, &alloc_hint); 3068 end + 1, &alloc_hint);
3069 cur_offset = end + 1;
3061 unlock_extent(&BTRFS_I(inode)->io_tree, start, end); 3070 unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3062 if (ret) 3071 if (ret)
3063 break; 3072 break;
3064 nr++; 3073 nr++;
3065 } 3074 }
3066 btrfs_free_reserved_data_space(inode, cluster->start, 3075 if (cur_offset < prealloc_end)
3067 cluster->end + 1 - cluster->start); 3076 btrfs_free_reserved_data_space(inode, cur_offset,
3077 prealloc_end + 1 - cur_offset);
3068out: 3078out:
3069 inode_unlock(inode); 3079 inode_unlock(inode);
3070 return ret; 3080 return ret;
@@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc)
3916 return 0; 3926 return 0;
3917} 3927}
3918 3928
3929/*
3930 * Qgroup fixer for data chunk relocation.
3931 * The data relocation is done in the following steps
3932 * 1) Copy data extents into data reloc tree
3933 * 2) Create tree reloc tree(special snapshot) for related subvolumes
3934 * 3) Modify file extents in tree reloc tree
3935 * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
3936 *
3937 * The problem is, data and tree reloc tree are not accounted to qgroup,
3938 * and 4) will only info qgroup to track tree blocks change, not file extents
3939 * in the tree blocks.
3940 *
3941 * The good news is, related data extents are all in data reloc tree, so we
3942 * only need to info qgroup to track all file extents in data reloc tree
3943 * before commit trans.
3944 */
3945static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
3946 struct reloc_control *rc)
3947{
3948 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3949 struct inode *inode = rc->data_inode;
3950 struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
3951 struct btrfs_path *path;
3952 struct btrfs_key key;
3953 int ret = 0;
3954
3955 if (!fs_info->quota_enabled)
3956 return 0;
3957
3958 /*
3959 * Only for stage where we update data pointers the qgroup fix is
3960 * valid.
3961 * For MOVING_DATA stage, we will miss the timing of swapping tree
3962 * blocks, and won't fix it.
3963 */
3964 if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
3965 return 0;
3966
3967 path = btrfs_alloc_path();
3968 if (!path)
3969 return -ENOMEM;
3970 key.objectid = btrfs_ino(inode);
3971 key.type = BTRFS_EXTENT_DATA_KEY;
3972 key.offset = 0;
3973
3974 ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
3975 if (ret < 0)
3976 goto out;
3977
3978 lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
3979 while (1) {
3980 struct btrfs_file_extent_item *fi;
3981
3982 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3983 if (key.objectid > btrfs_ino(inode))
3984 break;
3985 if (key.type != BTRFS_EXTENT_DATA_KEY)
3986 goto next;
3987 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
3988 struct btrfs_file_extent_item);
3989 if (btrfs_file_extent_type(path->nodes[0], fi) !=
3990 BTRFS_FILE_EXTENT_REG)
3991 goto next;
3992 ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
3993 btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
3994 btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
3995 GFP_NOFS);
3996 if (ret < 0)
3997 break;
3998next:
3999 ret = btrfs_next_item(data_reloc_root, path);
4000 if (ret < 0)
4001 break;
4002 if (ret > 0) {
4003 ret = 0;
4004 break;
4005 }
4006 }
4007 unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
4008out:
4009 btrfs_free_path(path);
4010 return ret;
4011}
4012
3919static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 4013static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3920{ 4014{
3921 struct rb_root blocks = RB_ROOT; 4015 struct rb_root blocks = RB_ROOT;
@@ -4102,10 +4196,18 @@ restart:
4102 4196
4103 /* get rid of pinned extents */ 4197 /* get rid of pinned extents */
4104 trans = btrfs_join_transaction(rc->extent_root); 4198 trans = btrfs_join_transaction(rc->extent_root);
4105 if (IS_ERR(trans)) 4199 if (IS_ERR(trans)) {
4106 err = PTR_ERR(trans); 4200 err = PTR_ERR(trans);
4107 else 4201 goto out_free;
4108 btrfs_commit_transaction(trans, rc->extent_root); 4202 }
4203 ret = qgroup_fix_relocated_data_extents(trans, rc);
4204 if (ret < 0) {
4205 btrfs_abort_transaction(trans, ret);
4206 if (!err)
4207 err = ret;
4208 goto out_free;
4209 }
4210 btrfs_commit_transaction(trans, rc->extent_root);
4109out_free: 4211out_free:
4110 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 4212 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
4111 btrfs_free_path(path); 4213 btrfs_free_path(path);
@@ -4468,10 +4570,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4468 unset_reloc_control(rc); 4570 unset_reloc_control(rc);
4469 4571
4470 trans = btrfs_join_transaction(rc->extent_root); 4572 trans = btrfs_join_transaction(rc->extent_root);
4471 if (IS_ERR(trans)) 4573 if (IS_ERR(trans)) {
4472 err = PTR_ERR(trans); 4574 err = PTR_ERR(trans);
4473 else 4575 goto out_free;
4474 err = btrfs_commit_transaction(trans, rc->extent_root); 4576 }
4577 err = qgroup_fix_relocated_data_extents(trans, rc);
4578 if (err < 0) {
4579 btrfs_abort_transaction(trans, err);
4580 goto out_free;
4581 }
4582 err = btrfs_commit_transaction(trans, rc->extent_root);
4475out_free: 4583out_free:
4476 kfree(rc); 4584 kfree(rc);
4477out: 4585out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7fd7e1830cfe..091296062456 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
272 root_key.objectid = key.offset; 272 root_key.objectid = key.offset;
273 key.offset++; 273 key.offset++;
274 274
275 /*
276 * The root might have been inserted already, as before we look
277 * for orphan roots, log replay might have happened, which
278 * triggers a transaction commit and qgroup accounting, which
279 * in turn reads and inserts fs roots while doing backref
280 * walking.
281 */
282 root = btrfs_lookup_fs_root(tree_root->fs_info,
283 root_key.objectid);
284 if (root) {
285 WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
286 &root->state));
287 if (btrfs_root_refs(&root->root_item) == 0)
288 btrfs_add_dead_root(root);
289 continue;
290 }
291
275 root = btrfs_read_fs_root(tree_root, &root_key); 292 root = btrfs_read_fs_root(tree_root, &root_key);
276 err = PTR_ERR_OR_ZERO(root); 293 err = PTR_ERR_OR_ZERO(root);
277 if (err && err != -ENOENT) { 294 if (err && err != -ENOENT) {
@@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
310 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); 327 set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
311 328
312 err = btrfs_insert_fs_root(root->fs_info, root); 329 err = btrfs_insert_fs_root(root->fs_info, root);
313 /*
314 * The root might have been inserted already, as before we look
315 * for orphan roots, log replay might have happened, which
316 * triggers a transaction commit and qgroup accounting, which
317 * in turn reads and inserts fs roots while doing backref
318 * walking.
319 */
320 if (err == -EEXIST)
321 err = 0;
322 if (err) { 330 if (err) {
331 BUG_ON(err == -EEXIST);
323 btrfs_free_fs_root(root); 332 btrfs_free_fs_root(root);
324 break; 333 break;
325 } 334 }
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b71dd298385c..a87675ffd02b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -231,7 +231,6 @@ struct pending_dir_move {
231 u64 parent_ino; 231 u64 parent_ino;
232 u64 ino; 232 u64 ino;
233 u64 gen; 233 u64 gen;
234 bool is_orphan;
235 struct list_head update_refs; 234 struct list_head update_refs;
236}; 235};
237 236
@@ -274,6 +273,39 @@ struct name_cache_entry {
274 char name[]; 273 char name[];
275}; 274};
276 275
276static void inconsistent_snapshot_error(struct send_ctx *sctx,
277 enum btrfs_compare_tree_result result,
278 const char *what)
279{
280 const char *result_string;
281
282 switch (result) {
283 case BTRFS_COMPARE_TREE_NEW:
284 result_string = "new";
285 break;
286 case BTRFS_COMPARE_TREE_DELETED:
287 result_string = "deleted";
288 break;
289 case BTRFS_COMPARE_TREE_CHANGED:
290 result_string = "updated";
291 break;
292 case BTRFS_COMPARE_TREE_SAME:
293 ASSERT(0);
294 result_string = "unchanged";
295 break;
296 default:
297 ASSERT(0);
298 result_string = "unexpected";
299 }
300
301 btrfs_err(sctx->send_root->fs_info,
302 "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
303 result_string, what, sctx->cmp_key->objectid,
304 sctx->send_root->root_key.objectid,
305 (sctx->parent_root ?
306 sctx->parent_root->root_key.objectid : 0));
307}
308
277static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); 309static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
278 310
279static struct waiting_dir_move * 311static struct waiting_dir_move *
@@ -1861,7 +1893,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1861 * was already unlinked/moved, so we can safely assume that we will not 1893 * was already unlinked/moved, so we can safely assume that we will not
1862 * overwrite anything at this point in time. 1894 * overwrite anything at this point in time.
1863 */ 1895 */
1864 if (other_inode > sctx->send_progress) { 1896 if (other_inode > sctx->send_progress ||
1897 is_waiting_for_move(sctx, other_inode)) {
1865 ret = get_inode_info(sctx->parent_root, other_inode, NULL, 1898 ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1866 who_gen, NULL, NULL, NULL, NULL); 1899 who_gen, NULL, NULL, NULL, NULL);
1867 if (ret < 0) 1900 if (ret < 0)
@@ -2502,6 +2535,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2502 key.type = BTRFS_INODE_ITEM_KEY; 2535 key.type = BTRFS_INODE_ITEM_KEY;
2503 key.offset = 0; 2536 key.offset = 0;
2504 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); 2537 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2538 if (ret > 0)
2539 ret = -ENOENT;
2505 if (ret < 0) 2540 if (ret < 0)
2506 goto out; 2541 goto out;
2507 2542
@@ -2947,6 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2947 } 2982 }
2948 2983
2949 if (loc.objectid > send_progress) { 2984 if (loc.objectid > send_progress) {
2985 struct orphan_dir_info *odi;
2986
2987 odi = get_orphan_dir_info(sctx, dir);
2988 free_orphan_dir_info(sctx, odi);
2950 ret = 0; 2989 ret = 0;
2951 goto out; 2990 goto out;
2952 } 2991 }
@@ -3047,7 +3086,6 @@ static int add_pending_dir_move(struct send_ctx *sctx,
3047 pm->parent_ino = parent_ino; 3086 pm->parent_ino = parent_ino;
3048 pm->ino = ino; 3087 pm->ino = ino;
3049 pm->gen = ino_gen; 3088 pm->gen = ino_gen;
3050 pm->is_orphan = is_orphan;
3051 INIT_LIST_HEAD(&pm->list); 3089 INIT_LIST_HEAD(&pm->list);
3052 INIT_LIST_HEAD(&pm->update_refs); 3090 INIT_LIST_HEAD(&pm->update_refs);
3053 RB_CLEAR_NODE(&pm->node); 3091 RB_CLEAR_NODE(&pm->node);
@@ -3113,6 +3151,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3113 return NULL; 3151 return NULL;
3114} 3152}
3115 3153
3154static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3155 u64 ino, u64 gen, u64 *ancestor_ino)
3156{
3157 int ret = 0;
3158 u64 parent_inode = 0;
3159 u64 parent_gen = 0;
3160 u64 start_ino = ino;
3161
3162 *ancestor_ino = 0;
3163 while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3164 fs_path_reset(name);
3165
3166 if (is_waiting_for_rm(sctx, ino))
3167 break;
3168 if (is_waiting_for_move(sctx, ino)) {
3169 if (*ancestor_ino == 0)
3170 *ancestor_ino = ino;
3171 ret = get_first_ref(sctx->parent_root, ino,
3172 &parent_inode, &parent_gen, name);
3173 } else {
3174 ret = __get_cur_name_and_parent(sctx, ino, gen,
3175 &parent_inode,
3176 &parent_gen, name);
3177 if (ret > 0) {
3178 ret = 0;
3179 break;
3180 }
3181 }
3182 if (ret < 0)
3183 break;
3184 if (parent_inode == start_ino) {
3185 ret = 1;
3186 if (*ancestor_ino == 0)
3187 *ancestor_ino = ino;
3188 break;
3189 }
3190 ino = parent_inode;
3191 gen = parent_gen;
3192 }
3193 return ret;
3194}
3195
3116static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) 3196static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3117{ 3197{
3118 struct fs_path *from_path = NULL; 3198 struct fs_path *from_path = NULL;
@@ -3123,6 +3203,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3123 u64 parent_ino, parent_gen; 3203 u64 parent_ino, parent_gen;
3124 struct waiting_dir_move *dm = NULL; 3204 struct waiting_dir_move *dm = NULL;
3125 u64 rmdir_ino = 0; 3205 u64 rmdir_ino = 0;
3206 u64 ancestor;
3207 bool is_orphan;
3126 int ret; 3208 int ret;
3127 3209
3128 name = fs_path_alloc(); 3210 name = fs_path_alloc();
@@ -3135,9 +3217,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3135 dm = get_waiting_dir_move(sctx, pm->ino); 3217 dm = get_waiting_dir_move(sctx, pm->ino);
3136 ASSERT(dm); 3218 ASSERT(dm);
3137 rmdir_ino = dm->rmdir_ino; 3219 rmdir_ino = dm->rmdir_ino;
3220 is_orphan = dm->orphanized;
3138 free_waiting_dir_move(sctx, dm); 3221 free_waiting_dir_move(sctx, dm);
3139 3222
3140 if (pm->is_orphan) { 3223 if (is_orphan) {
3141 ret = gen_unique_name(sctx, pm->ino, 3224 ret = gen_unique_name(sctx, pm->ino,
3142 pm->gen, from_path); 3225 pm->gen, from_path);
3143 } else { 3226 } else {
@@ -3155,6 +3238,24 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3155 goto out; 3238 goto out;
3156 3239
3157 sctx->send_progress = sctx->cur_ino + 1; 3240 sctx->send_progress = sctx->cur_ino + 1;
3241 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3242 if (ret < 0)
3243 goto out;
3244 if (ret) {
3245 LIST_HEAD(deleted_refs);
3246 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3247 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3248 &pm->update_refs, &deleted_refs,
3249 is_orphan);
3250 if (ret < 0)
3251 goto out;
3252 if (rmdir_ino) {
3253 dm = get_waiting_dir_move(sctx, pm->ino);
3254 ASSERT(dm);
3255 dm->rmdir_ino = rmdir_ino;
3256 }
3257 goto out;
3258 }
3158 fs_path_reset(name); 3259 fs_path_reset(name);
3159 to_path = name; 3260 to_path = name;
3160 name = NULL; 3261 name = NULL;
@@ -3174,7 +3275,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3174 /* already deleted */ 3275 /* already deleted */
3175 goto finish; 3276 goto finish;
3176 } 3277 }
3177 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); 3278 ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
3178 if (ret < 0) 3279 if (ret < 0)
3179 goto out; 3280 goto out;
3180 if (!ret) 3281 if (!ret)
@@ -3204,8 +3305,18 @@ finish:
3204 * and old parent(s). 3305 * and old parent(s).
3205 */ 3306 */
3206 list_for_each_entry(cur, &pm->update_refs, list) { 3307 list_for_each_entry(cur, &pm->update_refs, list) {
3207 if (cur->dir == rmdir_ino) 3308 /*
3309 * The parent inode might have been deleted in the send snapshot
3310 */
3311 ret = get_inode_info(sctx->send_root, cur->dir, NULL,
3312 NULL, NULL, NULL, NULL, NULL);
3313 if (ret == -ENOENT) {
3314 ret = 0;
3208 continue; 3315 continue;
3316 }
3317 if (ret < 0)
3318 goto out;
3319
3209 ret = send_utimes(sctx, cur->dir, cur->dir_gen); 3320 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3210 if (ret < 0) 3321 if (ret < 0)
3211 goto out; 3322 goto out;
@@ -3325,6 +3436,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
3325 u64 left_gen; 3436 u64 left_gen;
3326 u64 right_gen; 3437 u64 right_gen;
3327 int ret = 0; 3438 int ret = 0;
3439 struct waiting_dir_move *wdm;
3328 3440
3329 if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) 3441 if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
3330 return 0; 3442 return 0;
@@ -3383,7 +3495,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
3383 goto out; 3495 goto out;
3384 } 3496 }
3385 3497
3386 if (is_waiting_for_move(sctx, di_key.objectid)) { 3498 wdm = get_waiting_dir_move(sctx, di_key.objectid);
3499 if (wdm && !wdm->orphanized) {
3387 ret = add_pending_dir_move(sctx, 3500 ret = add_pending_dir_move(sctx,
3388 sctx->cur_ino, 3501 sctx->cur_ino,
3389 sctx->cur_inode_gen, 3502 sctx->cur_inode_gen,
@@ -3470,7 +3583,8 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3470 ret = is_ancestor(sctx->parent_root, 3583 ret = is_ancestor(sctx->parent_root,
3471 sctx->cur_ino, sctx->cur_inode_gen, 3584 sctx->cur_ino, sctx->cur_inode_gen,
3472 ino, path_before); 3585 ino, path_before);
3473 break; 3586 if (ret)
3587 break;
3474 } 3588 }
3475 3589
3476 fs_path_reset(path_before); 3590 fs_path_reset(path_before);
@@ -3643,11 +3757,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3643 goto out; 3757 goto out;
3644 if (ret) { 3758 if (ret) {
3645 struct name_cache_entry *nce; 3759 struct name_cache_entry *nce;
3760 struct waiting_dir_move *wdm;
3646 3761
3647 ret = orphanize_inode(sctx, ow_inode, ow_gen, 3762 ret = orphanize_inode(sctx, ow_inode, ow_gen,
3648 cur->full_path); 3763 cur->full_path);
3649 if (ret < 0) 3764 if (ret < 0)
3650 goto out; 3765 goto out;
3766
3767 /*
3768 * If ow_inode has its rename operation delayed
3769 * make sure that its orphanized name is used in
3770 * the source path when performing its rename
3771 * operation.
3772 */
3773 if (is_waiting_for_move(sctx, ow_inode)) {
3774 wdm = get_waiting_dir_move(sctx,
3775 ow_inode);
3776 ASSERT(wdm);
3777 wdm->orphanized = true;
3778 }
3779
3651 /* 3780 /*
3652 * Make sure we clear our orphanized inode's 3781 * Make sure we clear our orphanized inode's
3653 * name from the name cache. This is because the 3782 * name from the name cache. This is because the
@@ -3663,6 +3792,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3663 name_cache_delete(sctx, nce); 3792 name_cache_delete(sctx, nce);
3664 kfree(nce); 3793 kfree(nce);
3665 } 3794 }
3795
3796 /*
3797 * ow_inode might currently be an ancestor of
3798 * cur_ino, therefore compute valid_path (the
3799 * current path of cur_ino) again because it
3800 * might contain the pre-orphanization name of
3801 * ow_inode, which is no longer valid.
3802 */
3803 fs_path_reset(valid_path);
3804 ret = get_cur_path(sctx, sctx->cur_ino,
3805 sctx->cur_inode_gen, valid_path);
3806 if (ret < 0)
3807 goto out;
3666 } else { 3808 } else {
3667 ret = send_unlink(sctx, cur->full_path); 3809 ret = send_unlink(sctx, cur->full_path);
3668 if (ret < 0) 3810 if (ret < 0)
@@ -4126,10 +4268,12 @@ static int process_all_refs(struct send_ctx *sctx,
4126 } 4268 }
4127 btrfs_release_path(path); 4269 btrfs_release_path(path);
4128 4270
4271 /*
4272 * We don't actually care about pending_move as we are simply
4273 * re-creating this inode and will be rename'ing it into place once we
4274 * rename the parent directory.
4275 */
4129 ret = process_recorded_refs(sctx, &pending_move); 4276 ret = process_recorded_refs(sctx, &pending_move);
4130 /* Only applicable to an incremental send. */
4131 ASSERT(pending_move == 0);
4132
4133out: 4277out:
4134 btrfs_free_path(path); 4278 btrfs_free_path(path);
4135 return ret; 4279 return ret;
@@ -5602,7 +5746,10 @@ static int changed_ref(struct send_ctx *sctx,
5602{ 5746{
5603 int ret = 0; 5747 int ret = 0;
5604 5748
5605 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); 5749 if (sctx->cur_ino != sctx->cmp_key->objectid) {
5750 inconsistent_snapshot_error(sctx, result, "reference");
5751 return -EIO;
5752 }
5606 5753
5607 if (!sctx->cur_inode_new_gen && 5754 if (!sctx->cur_inode_new_gen &&
5608 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { 5755 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
@@ -5627,7 +5774,10 @@ static int changed_xattr(struct send_ctx *sctx,
5627{ 5774{
5628 int ret = 0; 5775 int ret = 0;
5629 5776
5630 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); 5777 if (sctx->cur_ino != sctx->cmp_key->objectid) {
5778 inconsistent_snapshot_error(sctx, result, "xattr");
5779 return -EIO;
5780 }
5631 5781
5632 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 5782 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
5633 if (result == BTRFS_COMPARE_TREE_NEW) 5783 if (result == BTRFS_COMPARE_TREE_NEW)
@@ -5651,7 +5801,10 @@ static int changed_extent(struct send_ctx *sctx,
5651{ 5801{
5652 int ret = 0; 5802 int ret = 0;
5653 5803
5654 BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); 5804 if (sctx->cur_ino != sctx->cmp_key->objectid) {
5805 inconsistent_snapshot_error(sctx, result, "extent");
5806 return -EIO;
5807 }
5655 5808
5656 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { 5809 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
5657 if (result != BTRFS_COMPARE_TREE_DELETED) 5810 if (result != BTRFS_COMPARE_TREE_DELETED)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 864ce334f696..4071fe2bd098 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb)
2241 struct btrfs_trans_handle *trans; 2241 struct btrfs_trans_handle *trans;
2242 struct btrfs_root *root = btrfs_sb(sb)->tree_root; 2242 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
2243 2243
2244 root->fs_info->fs_frozen = 1;
2245 /*
2246 * We don't need a barrier here, we'll wait for any transaction that
2247 * could be in progress on other threads (and do delayed iputs that
2248 * we want to avoid on a frozen filesystem), or do the commit
2249 * ourselves.
2250 */
2244 trans = btrfs_attach_transaction_barrier(root); 2251 trans = btrfs_attach_transaction_barrier(root);
2245 if (IS_ERR(trans)) { 2252 if (IS_ERR(trans)) {
2246 /* no transaction, don't bother */ 2253 /* no transaction, don't bother */
@@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb)
2251 return btrfs_commit_transaction(trans, root); 2258 return btrfs_commit_transaction(trans, root);
2252} 2259}
2253 2260
2261static int btrfs_unfreeze(struct super_block *sb)
2262{
2263 struct btrfs_root *root = btrfs_sb(sb)->tree_root;
2264
2265 root->fs_info->fs_frozen = 0;
2266 return 0;
2267}
2268
2254static int btrfs_show_devname(struct seq_file *m, struct dentry *root) 2269static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
2255{ 2270{
2256 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); 2271 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = {
2299 .statfs = btrfs_statfs, 2314 .statfs = btrfs_statfs,
2300 .remount_fs = btrfs_remount, 2315 .remount_fs = btrfs_remount,
2301 .freeze_fs = btrfs_freeze, 2316 .freeze_fs = btrfs_freeze,
2317 .unfreeze_fs = btrfs_unfreeze,
2302}; 2318};
2303 2319
2304static const struct file_operations btrfs_ctl_fops = { 2320static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9cca0a721961..95d41919d034 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
2278 2278
2279 kmem_cache_free(btrfs_trans_handle_cachep, trans); 2279 kmem_cache_free(btrfs_trans_handle_cachep, trans);
2280 2280
2281 /*
2282 * If fs has been frozen, we can not handle delayed iputs, otherwise
2283 * it'll result in deadlock about SB_FREEZE_FS.
2284 */
2281 if (current != root->fs_info->transaction_kthread && 2285 if (current != root->fs_info->transaction_kthread &&
2282 current != root->fs_info->cleaner_kthread) 2286 current != root->fs_info->cleaner_kthread &&
2287 !root->fs_info->fs_frozen)
2283 btrfs_run_delayed_iputs(root); 2288 btrfs_run_delayed_iputs(root);
2284 2289
2285 return ret; 2290 return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d31a0c4f56be..ef9c55bc7907 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
27#include "backref.h" 27#include "backref.h"
28#include "hash.h" 28#include "hash.h"
29#include "compression.h" 29#include "compression.h"
30#include "qgroup.h"
30 31
31/* magic values for the inode_only field in btrfs_log_inode: 32/* magic values for the inode_only field in btrfs_log_inode:
32 * 33 *
@@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
680 ins.type = BTRFS_EXTENT_ITEM_KEY; 681 ins.type = BTRFS_EXTENT_ITEM_KEY;
681 offset = key->offset - btrfs_file_extent_offset(eb, item); 682 offset = key->offset - btrfs_file_extent_offset(eb, item);
682 683
684 /*
685 * Manually record dirty extent, as here we did a shallow
686 * file extent item copy and skip normal backref update,
687 * but modifying extent tree all by ourselves.
688 * So need to manually record dirty extent for qgroup,
689 * as the owner of the file extent changed from log tree
690 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
691 */
692 ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
693 btrfs_file_extent_disk_bytenr(eb, item),
694 btrfs_file_extent_disk_num_bytes(eb, item),
695 GFP_NOFS);
696 if (ret < 0)
697 goto out;
698
683 if (ins.objectid > 0) { 699 if (ins.objectid > 0) {
684 u64 csum_start; 700 u64 csum_start;
685 u64 csum_end; 701 u64 csum_end;
@@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2807 */ 2823 */
2808 mutex_unlock(&root->log_mutex); 2824 mutex_unlock(&root->log_mutex);
2809 2825
2810 btrfs_init_log_ctx(&root_log_ctx); 2826 btrfs_init_log_ctx(&root_log_ctx, NULL);
2811 2827
2812 mutex_lock(&log_root_tree->log_mutex); 2828 mutex_lock(&log_root_tree->log_mutex);
2813 atomic_inc(&log_root_tree->log_batch); 2829 atomic_inc(&log_root_tree->log_batch);
@@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2851 2867
2852 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2868 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2853 blk_finish_plug(&plug); 2869 blk_finish_plug(&plug);
2870 list_del_init(&root_log_ctx.list);
2854 mutex_unlock(&log_root_tree->log_mutex); 2871 mutex_unlock(&log_root_tree->log_mutex);
2855 ret = root_log_ctx.log_ret; 2872 ret = root_log_ctx.log_ret;
2856 goto out; 2873 goto out;
@@ -4469,7 +4486,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4469static int btrfs_check_ref_name_override(struct extent_buffer *eb, 4486static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4470 const int slot, 4487 const int slot,
4471 const struct btrfs_key *key, 4488 const struct btrfs_key *key,
4472 struct inode *inode) 4489 struct inode *inode,
4490 u64 *other_ino)
4473{ 4491{
4474 int ret; 4492 int ret;
4475 struct btrfs_path *search_path; 4493 struct btrfs_path *search_path;
@@ -4528,7 +4546,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4528 search_path, parent, 4546 search_path, parent,
4529 name, this_name_len, 0); 4547 name, this_name_len, 0);
4530 if (di && !IS_ERR(di)) { 4548 if (di && !IS_ERR(di)) {
4531 ret = 1; 4549 struct btrfs_key di_key;
4550
4551 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4552 di, &di_key);
4553 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4554 ret = 1;
4555 *other_ino = di_key.objectid;
4556 } else {
4557 ret = -EAGAIN;
4558 }
4532 goto out; 4559 goto out;
4533 } else if (IS_ERR(di)) { 4560 } else if (IS_ERR(di)) {
4534 ret = PTR_ERR(di); 4561 ret = PTR_ERR(di);
@@ -4722,16 +4749,72 @@ again:
4722 if ((min_key.type == BTRFS_INODE_REF_KEY || 4749 if ((min_key.type == BTRFS_INODE_REF_KEY ||
4723 min_key.type == BTRFS_INODE_EXTREF_KEY) && 4750 min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4724 BTRFS_I(inode)->generation == trans->transid) { 4751 BTRFS_I(inode)->generation == trans->transid) {
4752 u64 other_ino = 0;
4753
4725 ret = btrfs_check_ref_name_override(path->nodes[0], 4754 ret = btrfs_check_ref_name_override(path->nodes[0],
4726 path->slots[0], 4755 path->slots[0],
4727 &min_key, inode); 4756 &min_key, inode,
4757 &other_ino);
4728 if (ret < 0) { 4758 if (ret < 0) {
4729 err = ret; 4759 err = ret;
4730 goto out_unlock; 4760 goto out_unlock;
4731 } else if (ret > 0) { 4761 } else if (ret > 0 && ctx &&
4732 err = 1; 4762 other_ino != btrfs_ino(ctx->inode)) {
4733 btrfs_set_log_full_commit(root->fs_info, trans); 4763 struct btrfs_key inode_key;
4734 goto out_unlock; 4764 struct inode *other_inode;
4765
4766 if (ins_nr > 0) {
4767 ins_nr++;
4768 } else {
4769 ins_nr = 1;
4770 ins_start_slot = path->slots[0];
4771 }
4772 ret = copy_items(trans, inode, dst_path, path,
4773 &last_extent, ins_start_slot,
4774 ins_nr, inode_only,
4775 logged_isize);
4776 if (ret < 0) {
4777 err = ret;
4778 goto out_unlock;
4779 }
4780 ins_nr = 0;
4781 btrfs_release_path(path);
4782 inode_key.objectid = other_ino;
4783 inode_key.type = BTRFS_INODE_ITEM_KEY;
4784 inode_key.offset = 0;
4785 other_inode = btrfs_iget(root->fs_info->sb,
4786 &inode_key, root,
4787 NULL);
4788 /*
4789 * If the other inode that had a conflicting dir
4790 * entry was deleted in the current transaction,
4791 * we don't need to do more work nor fallback to
4792 * a transaction commit.
4793 */
4794 if (IS_ERR(other_inode) &&
4795 PTR_ERR(other_inode) == -ENOENT) {
4796 goto next_key;
4797 } else if (IS_ERR(other_inode)) {
4798 err = PTR_ERR(other_inode);
4799 goto out_unlock;
4800 }
4801 /*
4802 * We are safe logging the other inode without
4803 * acquiring its i_mutex as long as we log with
4804 * the LOG_INODE_EXISTS mode. We're safe against
4805 * concurrent renames of the other inode as well
4806 * because during a rename we pin the log and
4807 * update the log with the new name before we
4808 * unpin it.
4809 */
4810 err = btrfs_log_inode(trans, root, other_inode,
4811 LOG_INODE_EXISTS,
4812 0, LLONG_MAX, ctx);
4813 iput(other_inode);
4814 if (err)
4815 goto out_unlock;
4816 else
4817 goto next_key;
4735 } 4818 }
4736 } 4819 }
4737 4820
@@ -4799,7 +4882,7 @@ next_slot:
4799 ins_nr = 0; 4882 ins_nr = 0;
4800 } 4883 }
4801 btrfs_release_path(path); 4884 btrfs_release_path(path);
4802 4885next_key:
4803 if (min_key.offset < (u64)-1) { 4886 if (min_key.offset < (u64)-1) {
4804 min_key.offset++; 4887 min_key.offset++;
4805 } else if (min_key.type < max_key.type) { 4888 } else if (min_key.type < max_key.type) {
@@ -4993,8 +5076,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
4993 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) 5076 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
4994 break; 5077 break;
4995 5078
4996 if (IS_ROOT(parent)) 5079 if (IS_ROOT(parent)) {
5080 inode = d_inode(parent);
5081 if (btrfs_must_commit_transaction(trans, inode))
5082 ret = 1;
4997 break; 5083 break;
5084 }
4998 5085
4999 parent = dget_parent(parent); 5086 parent = dget_parent(parent);
5000 dput(old_parent); 5087 dput(old_parent);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index a9f1b75d080d..ab858e31ccbc 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -30,15 +30,18 @@ struct btrfs_log_ctx {
30 int log_transid; 30 int log_transid;
31 int io_err; 31 int io_err;
32 bool log_new_dentries; 32 bool log_new_dentries;
33 struct inode *inode;
33 struct list_head list; 34 struct list_head list;
34}; 35};
35 36
36static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) 37static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
38 struct inode *inode)
37{ 39{
38 ctx->log_ret = 0; 40 ctx->log_ret = 0;
39 ctx->log_transid = 0; 41 ctx->log_transid = 0;
40 ctx->io_err = 0; 42 ctx->io_err = 0;
41 ctx->log_new_dentries = false; 43 ctx->log_new_dentries = false;
44 ctx->inode = inode;
42 INIT_LIST_HEAD(&ctx->list); 45 INIT_LIST_HEAD(&ctx->list);
43} 46}
44 47
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51f125508771..035efce603a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work)
834 struct btrfs_device *device; 834 struct btrfs_device *device;
835 835
836 device = container_of(work, struct btrfs_device, rcu_work); 836 device = container_of(work, struct btrfs_device, rcu_work);
837
838 if (device->bdev)
839 blkdev_put(device->bdev, device->mode);
840
841 rcu_string_free(device->name); 837 rcu_string_free(device->name);
842 kfree(device); 838 kfree(device);
843} 839}
@@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head)
852 schedule_work(&device->rcu_work); 848 schedule_work(&device->rcu_work);
853} 849}
854 850
851static void btrfs_close_bdev(struct btrfs_device *device)
852{
853 if (device->bdev && device->writeable) {
854 sync_blockdev(device->bdev);
855 invalidate_bdev(device->bdev);
856 }
857
858 if (device->bdev)
859 blkdev_put(device->bdev, device->mode);
860}
861
855static void btrfs_close_one_device(struct btrfs_device *device) 862static void btrfs_close_one_device(struct btrfs_device *device)
856{ 863{
857 struct btrfs_fs_devices *fs_devices = device->fs_devices; 864 struct btrfs_fs_devices *fs_devices = device->fs_devices;
@@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
870 if (device->missing) 877 if (device->missing)
871 fs_devices->missing_devices--; 878 fs_devices->missing_devices--;
872 879
873 if (device->bdev && device->writeable) { 880 btrfs_close_bdev(device);
874 sync_blockdev(device->bdev);
875 invalidate_bdev(device->bdev);
876 }
877 881
878 new_device = btrfs_alloc_device(NULL, &device->devid, 882 new_device = btrfs_alloc_device(NULL, &device->devid,
879 device->uuid); 883 device->uuid);
@@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
1932 btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); 1936 btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
1933 } 1937 }
1934 1938
1939 btrfs_close_bdev(device);
1940
1935 call_rcu(&device->rcu, free_device); 1941 call_rcu(&device->rcu, free_device);
1936 1942
1937 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; 1943 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
@@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2025 /* zero out the old super if it is writable */ 2031 /* zero out the old super if it is writable */
2026 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); 2032 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2027 } 2033 }
2034
2035 btrfs_close_bdev(srcdev);
2036
2028 call_rcu(&srcdev->rcu, free_device); 2037 call_rcu(&srcdev->rcu, free_device);
2029 2038
2030 /* 2039 /*
@@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2080 * the device_list_mutex lock. 2089 * the device_list_mutex lock.
2081 */ 2090 */
2082 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); 2091 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2092
2093 btrfs_close_bdev(tgtdev);
2083 call_rcu(&tgtdev->rcu, free_device); 2094 call_rcu(&tgtdev->rcu, free_device);
2084} 2095}
2085 2096
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 99115cae1652..16e6ded0b7f2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1347,9 +1347,12 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
1347{ 1347{
1348 struct inode *inode = &ci->vfs_inode; 1348 struct inode *inode = &ci->vfs_inode;
1349 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1349 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1350 struct ceph_mds_session *session = *psession; 1350 struct ceph_mds_session *session = NULL;
1351 int mds; 1351 int mds;
1352
1352 dout("ceph_flush_snaps %p\n", inode); 1353 dout("ceph_flush_snaps %p\n", inode);
1354 if (psession)
1355 session = *psession;
1353retry: 1356retry:
1354 spin_lock(&ci->i_ceph_lock); 1357 spin_lock(&ci->i_ceph_lock);
1355 if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { 1358 if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c64a0b794d49..df4b3e6fa563 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
597 if (is_hash_order(new_pos)) { 597 if (is_hash_order(new_pos)) {
598 /* no need to reset last_name for a forward seek when 598 /* no need to reset last_name for a forward seek when
599 * dentries are sotred in hash order */ 599 * dentries are sotred in hash order */
600 } else if (fi->frag |= fpos_frag(new_pos)) { 600 } else if (fi->frag != fpos_frag(new_pos)) {
601 return true; 601 return true;
602 } 602 }
603 rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; 603 rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa59a85226b2..f72d4ae303b2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2759,6 +2759,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2759 } else { 2759 } else {
2760 path = NULL; 2760 path = NULL;
2761 pathlen = 0; 2761 pathlen = 0;
2762 pathbase = 0;
2762 } 2763 }
2763 2764
2764 spin_lock(&ci->i_ceph_lock); 2765 spin_lock(&ci->i_ceph_lock);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 6bbec5e784cd..14ae4b8e1a3c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -609,6 +609,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
609 char *s, *p; 609 char *s, *p;
610 char sep; 610 char sep;
611 611
612 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
613 return dget(sb->s_root);
614
612 full_path = cifs_build_path_to_root(vol, cifs_sb, 615 full_path = cifs_build_path_to_root(vol, cifs_sb,
613 cifs_sb_master_tcon(cifs_sb)); 616 cifs_sb_master_tcon(cifs_sb));
614 if (full_path == NULL) 617 if (full_path == NULL)
@@ -686,26 +689,22 @@ cifs_do_mount(struct file_system_type *fs_type,
686 cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); 689 cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
687 if (cifs_sb->mountdata == NULL) { 690 if (cifs_sb->mountdata == NULL) {
688 root = ERR_PTR(-ENOMEM); 691 root = ERR_PTR(-ENOMEM);
689 goto out_cifs_sb; 692 goto out_free;
690 } 693 }
691 694
692 if (volume_info->prepath) { 695 rc = cifs_setup_cifs_sb(volume_info, cifs_sb);
693 cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL); 696 if (rc) {
694 if (cifs_sb->prepath == NULL) { 697 root = ERR_PTR(rc);
695 root = ERR_PTR(-ENOMEM); 698 goto out_free;
696 goto out_cifs_sb;
697 }
698 } 699 }
699 700
700 cifs_setup_cifs_sb(volume_info, cifs_sb);
701
702 rc = cifs_mount(cifs_sb, volume_info); 701 rc = cifs_mount(cifs_sb, volume_info);
703 if (rc) { 702 if (rc) {
704 if (!(flags & MS_SILENT)) 703 if (!(flags & MS_SILENT))
705 cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", 704 cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
706 rc); 705 rc);
707 root = ERR_PTR(rc); 706 root = ERR_PTR(rc);
708 goto out_mountdata; 707 goto out_free;
709 } 708 }
710 709
711 mnt_data.vol = volume_info; 710 mnt_data.vol = volume_info;
@@ -735,11 +734,7 @@ cifs_do_mount(struct file_system_type *fs_type,
735 sb->s_flags |= MS_ACTIVE; 734 sb->s_flags |= MS_ACTIVE;
736 } 735 }
737 736
738 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) 737 root = cifs_get_root(volume_info, sb);
739 root = dget(sb->s_root);
740 else
741 root = cifs_get_root(volume_info, sb);
742
743 if (IS_ERR(root)) 738 if (IS_ERR(root))
744 goto out_super; 739 goto out_super;
745 740
@@ -752,9 +747,9 @@ out:
752 cifs_cleanup_volume_info(volume_info); 747 cifs_cleanup_volume_info(volume_info);
753 return root; 748 return root;
754 749
755out_mountdata: 750out_free:
751 kfree(cifs_sb->prepath);
756 kfree(cifs_sb->mountdata); 752 kfree(cifs_sb->mountdata);
757out_cifs_sb:
758 kfree(cifs_sb); 753 kfree(cifs_sb);
759out_nls: 754out_nls:
760 unload_nls(volume_info->local_nls); 755 unload_nls(volume_info->local_nls);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1243bd326591..95dab43646f0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -184,7 +184,7 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
184 unsigned int to_read); 184 unsigned int to_read);
185extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, 185extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
186 struct page *page, unsigned int to_read); 186 struct page *page, unsigned int to_read);
187extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, 187extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
188 struct cifs_sb_info *cifs_sb); 188 struct cifs_sb_info *cifs_sb);
189extern int cifs_match_super(struct super_block *, void *); 189extern int cifs_match_super(struct super_block *, void *);
190extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); 190extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ae03283bd61..2e4f4bad8b1e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2781,6 +2781,24 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
2781 return 1; 2781 return 1;
2782} 2782}
2783 2783
2784static int
2785match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
2786{
2787 struct cifs_sb_info *old = CIFS_SB(sb);
2788 struct cifs_sb_info *new = mnt_data->cifs_sb;
2789
2790 if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) {
2791 if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH))
2792 return 0;
2793 /* The prepath should be null terminated strings */
2794 if (strcmp(new->prepath, old->prepath))
2795 return 0;
2796
2797 return 1;
2798 }
2799 return 0;
2800}
2801
2784int 2802int
2785cifs_match_super(struct super_block *sb, void *data) 2803cifs_match_super(struct super_block *sb, void *data)
2786{ 2804{
@@ -2808,7 +2826,8 @@ cifs_match_super(struct super_block *sb, void *data)
2808 2826
2809 if (!match_server(tcp_srv, volume_info) || 2827 if (!match_server(tcp_srv, volume_info) ||
2810 !match_session(ses, volume_info) || 2828 !match_session(ses, volume_info) ||
2811 !match_tcon(tcon, volume_info->UNC)) { 2829 !match_tcon(tcon, volume_info->UNC) ||
2830 !match_prepath(sb, mnt_data)) {
2812 rc = 0; 2831 rc = 0;
2813 goto out; 2832 goto out;
2814 } 2833 }
@@ -3222,7 +3241,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
3222 } 3241 }
3223} 3242}
3224 3243
3225void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, 3244int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
3226 struct cifs_sb_info *cifs_sb) 3245 struct cifs_sb_info *cifs_sb)
3227{ 3246{
3228 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); 3247 INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
@@ -3316,6 +3335,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
3316 3335
3317 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 3336 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
3318 cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n"); 3337 cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n");
3338
3339 if (pvolume_info->prepath) {
3340 cifs_sb->prepath = kstrdup(pvolume_info->prepath, GFP_KERNEL);
3341 if (cifs_sb->prepath == NULL)
3342 return -ENOMEM;
3343 }
3344
3345 return 0;
3319} 3346}
3320 3347
3321static void 3348static void
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index c30cf49b69d2..2c6312db8516 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -333,6 +333,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
333 if (bin_attr->cb_max_size && 333 if (bin_attr->cb_max_size &&
334 *ppos + count > bin_attr->cb_max_size) { 334 *ppos + count > bin_attr->cb_max_size) {
335 len = -EFBIG; 335 len = -EFBIG;
336 goto out;
336 } 337 }
337 338
338 tbuf = vmalloc(*ppos + count); 339 tbuf = vmalloc(*ppos + count);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 0f9961eede1e..ed115acb5dee 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -11,6 +11,7 @@
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/fscrypto.h> 13#include <linux/fscrypto.h>
14#include <linux/mount.h>
14 15
15static int inode_has_encryption_context(struct inode *inode) 16static int inode_has_encryption_context(struct inode *inode)
16{ 17{
@@ -92,26 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode,
92 return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); 93 return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
93} 94}
94 95
95int fscrypt_process_policy(struct inode *inode, 96int fscrypt_process_policy(struct file *filp,
96 const struct fscrypt_policy *policy) 97 const struct fscrypt_policy *policy)
97{ 98{
99 struct inode *inode = file_inode(filp);
100 int ret;
101
102 if (!inode_owner_or_capable(inode))
103 return -EACCES;
104
98 if (policy->version != 0) 105 if (policy->version != 0)
99 return -EINVAL; 106 return -EINVAL;
100 107
108 ret = mnt_want_write_file(filp);
109 if (ret)
110 return ret;
111
101 if (!inode_has_encryption_context(inode)) { 112 if (!inode_has_encryption_context(inode)) {
102 if (!inode->i_sb->s_cop->empty_dir) 113 if (!S_ISDIR(inode->i_mode))
103 return -EOPNOTSUPP; 114 ret = -EINVAL;
104 if (!inode->i_sb->s_cop->empty_dir(inode)) 115 else if (!inode->i_sb->s_cop->empty_dir)
105 return -ENOTEMPTY; 116 ret = -EOPNOTSUPP;
106 return create_encryption_context_from_policy(inode, policy); 117 else if (!inode->i_sb->s_cop->empty_dir(inode))
118 ret = -ENOTEMPTY;
119 else
120 ret = create_encryption_context_from_policy(inode,
121 policy);
122 } else if (!is_encryption_context_consistent_with_policy(inode,
123 policy)) {
124 printk(KERN_WARNING
125 "%s: Policy inconsistent with encryption context\n",
126 __func__);
127 ret = -EINVAL;
107 } 128 }
108 129
109 if (is_encryption_context_consistent_with_policy(inode, policy)) 130 mnt_drop_write_file(filp);
110 return 0; 131 return ret;
111
112 printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
113 __func__);
114 return -EINVAL;
115} 132}
116EXPORT_SYMBOL(fscrypt_process_policy); 133EXPORT_SYMBOL(fscrypt_process_policy);
117 134
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d116453b0276..79a5941c2474 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -585,7 +585,8 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
585 */ 585 */
586void *devpts_get_priv(struct dentry *dentry) 586void *devpts_get_priv(struct dentry *dentry)
587{ 587{
588 WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC); 588 if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC)
589 return NULL;
589 return dentry->d_fsdata; 590 return dentry->d_fsdata;
590} 591}
591 592
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index eea64912c9c0..466f7d60edc2 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -607,20 +607,54 @@ static const struct file_operations format2_fops;
607static const struct file_operations format3_fops; 607static const struct file_operations format3_fops;
608static const struct file_operations format4_fops; 608static const struct file_operations format4_fops;
609 609
610static int table_open(struct inode *inode, struct file *file) 610static int table_open1(struct inode *inode, struct file *file)
611{ 611{
612 struct seq_file *seq; 612 struct seq_file *seq;
613 int ret = -1; 613 int ret;
614 614
615 if (file->f_op == &format1_fops) 615 ret = seq_open(file, &format1_seq_ops);
616 ret = seq_open(file, &format1_seq_ops); 616 if (ret)
617 else if (file->f_op == &format2_fops) 617 return ret;
618 ret = seq_open(file, &format2_seq_ops); 618
619 else if (file->f_op == &format3_fops) 619 seq = file->private_data;
620 ret = seq_open(file, &format3_seq_ops); 620 seq->private = inode->i_private; /* the dlm_ls */
621 else if (file->f_op == &format4_fops) 621 return 0;
622 ret = seq_open(file, &format4_seq_ops); 622}
623
624static int table_open2(struct inode *inode, struct file *file)
625{
626 struct seq_file *seq;
627 int ret;
628
629 ret = seq_open(file, &format2_seq_ops);
630 if (ret)
631 return ret;
632
633 seq = file->private_data;
634 seq->private = inode->i_private; /* the dlm_ls */
635 return 0;
636}
637
638static int table_open3(struct inode *inode, struct file *file)
639{
640 struct seq_file *seq;
641 int ret;
642
643 ret = seq_open(file, &format3_seq_ops);
644 if (ret)
645 return ret;
646
647 seq = file->private_data;
648 seq->private = inode->i_private; /* the dlm_ls */
649 return 0;
650}
651
652static int table_open4(struct inode *inode, struct file *file)
653{
654 struct seq_file *seq;
655 int ret;
623 656
657 ret = seq_open(file, &format4_seq_ops);
624 if (ret) 658 if (ret)
625 return ret; 659 return ret;
626 660
@@ -631,7 +665,7 @@ static int table_open(struct inode *inode, struct file *file)
631 665
632static const struct file_operations format1_fops = { 666static const struct file_operations format1_fops = {
633 .owner = THIS_MODULE, 667 .owner = THIS_MODULE,
634 .open = table_open, 668 .open = table_open1,
635 .read = seq_read, 669 .read = seq_read,
636 .llseek = seq_lseek, 670 .llseek = seq_lseek,
637 .release = seq_release 671 .release = seq_release
@@ -639,7 +673,7 @@ static const struct file_operations format1_fops = {
639 673
640static const struct file_operations format2_fops = { 674static const struct file_operations format2_fops = {
641 .owner = THIS_MODULE, 675 .owner = THIS_MODULE,
642 .open = table_open, 676 .open = table_open2,
643 .read = seq_read, 677 .read = seq_read,
644 .llseek = seq_lseek, 678 .llseek = seq_lseek,
645 .release = seq_release 679 .release = seq_release
@@ -647,7 +681,7 @@ static const struct file_operations format2_fops = {
647 681
648static const struct file_operations format3_fops = { 682static const struct file_operations format3_fops = {
649 .owner = THIS_MODULE, 683 .owner = THIS_MODULE,
650 .open = table_open, 684 .open = table_open3,
651 .read = seq_read, 685 .read = seq_read,
652 .llseek = seq_lseek, 686 .llseek = seq_lseek,
653 .release = seq_release 687 .release = seq_release
@@ -655,7 +689,7 @@ static const struct file_operations format3_fops = {
655 689
656static const struct file_operations format4_fops = { 690static const struct file_operations format4_fops = {
657 .owner = THIS_MODULE, 691 .owner = THIS_MODULE,
658 .open = table_open, 692 .open = table_open4,
659 .read = seq_read, 693 .read = seq_read,
660 .llseek = seq_lseek, 694 .llseek = seq_lseek,
661 .release = seq_release 695 .release = seq_release
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3131747199e1..c6ea25a190f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5466,8 +5466,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5466 sbi->s_want_extra_isize, 5466 sbi->s_want_extra_isize,
5467 iloc, handle); 5467 iloc, handle);
5468 if (ret) { 5468 if (ret) {
5469 ext4_set_inode_state(inode,
5470 EXT4_STATE_NO_EXPAND);
5471 if (mnt_count != 5469 if (mnt_count !=
5472 le16_to_cpu(sbi->s_es->s_mnt_count)) { 5470 le16_to_cpu(sbi->s_es->s_mnt_count)) {
5473 ext4_warning(inode->i_sb, 5471 ext4_warning(inode->i_sb,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 10686fd67fb4..1bb7df5e4536 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -776,7 +776,7 @@ resizefs_out:
776 (struct fscrypt_policy __user *)arg, 776 (struct fscrypt_policy __user *)arg,
777 sizeof(policy))) 777 sizeof(policy)))
778 return -EFAULT; 778 return -EFAULT;
779 return fscrypt_process_policy(inode, &policy); 779 return fscrypt_process_policy(filp, &policy);
780#else 780#else
781 return -EOPNOTSUPP; 781 return -EOPNOTSUPP;
782#endif 782#endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1c593aa0218e..3ec8708989ca 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2211,6 +2211,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
2211 2211
2212/* Called at mount-time, super-block is locked */ 2212/* Called at mount-time, super-block is locked */
2213static int ext4_check_descriptors(struct super_block *sb, 2213static int ext4_check_descriptors(struct super_block *sb,
2214 ext4_fsblk_t sb_block,
2214 ext4_group_t *first_not_zeroed) 2215 ext4_group_t *first_not_zeroed)
2215{ 2216{
2216 struct ext4_sb_info *sbi = EXT4_SB(sb); 2217 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2241,6 +2242,11 @@ static int ext4_check_descriptors(struct super_block *sb,
2241 grp = i; 2242 grp = i;
2242 2243
2243 block_bitmap = ext4_block_bitmap(sb, gdp); 2244 block_bitmap = ext4_block_bitmap(sb, gdp);
2245 if (block_bitmap == sb_block) {
2246 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2247 "Block bitmap for group %u overlaps "
2248 "superblock", i);
2249 }
2244 if (block_bitmap < first_block || block_bitmap > last_block) { 2250 if (block_bitmap < first_block || block_bitmap > last_block) {
2245 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2251 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2246 "Block bitmap for group %u not in group " 2252 "Block bitmap for group %u not in group "
@@ -2248,6 +2254,11 @@ static int ext4_check_descriptors(struct super_block *sb,
2248 return 0; 2254 return 0;
2249 } 2255 }
2250 inode_bitmap = ext4_inode_bitmap(sb, gdp); 2256 inode_bitmap = ext4_inode_bitmap(sb, gdp);
2257 if (inode_bitmap == sb_block) {
2258 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2259 "Inode bitmap for group %u overlaps "
2260 "superblock", i);
2261 }
2251 if (inode_bitmap < first_block || inode_bitmap > last_block) { 2262 if (inode_bitmap < first_block || inode_bitmap > last_block) {
2252 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2263 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2253 "Inode bitmap for group %u not in group " 2264 "Inode bitmap for group %u not in group "
@@ -2255,6 +2266,11 @@ static int ext4_check_descriptors(struct super_block *sb,
2255 return 0; 2266 return 0;
2256 } 2267 }
2257 inode_table = ext4_inode_table(sb, gdp); 2268 inode_table = ext4_inode_table(sb, gdp);
2269 if (inode_table == sb_block) {
2270 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2271 "Inode table for group %u overlaps "
2272 "superblock", i);
2273 }
2258 if (inode_table < first_block || 2274 if (inode_table < first_block ||
2259 inode_table + sbi->s_itb_per_group - 1 > last_block) { 2275 inode_table + sbi->s_itb_per_group - 1 > last_block) {
2260 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " 2276 ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -3757,7 +3773,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3757 goto failed_mount2; 3773 goto failed_mount2;
3758 } 3774 }
3759 } 3775 }
3760 if (!ext4_check_descriptors(sb, &first_not_zeroed)) { 3776 if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
3761 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3777 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
3762 ret = -EFSCORRUPTED; 3778 ret = -EFSCORRUPTED;
3763 goto failed_mount2; 3779 goto failed_mount2;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 39e9cfb1b371..2eb935ca5d9e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1353,15 +1353,19 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
1353 size_t min_offs, free; 1353 size_t min_offs, free;
1354 int total_ino; 1354 int total_ino;
1355 void *base, *start, *end; 1355 void *base, *start, *end;
1356 int extra_isize = 0, error = 0, tried_min_extra_isize = 0; 1356 int error = 0, tried_min_extra_isize = 0;
1357 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); 1357 int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
1358 int isize_diff; /* How much do we need to grow i_extra_isize */
1358 1359
1359 down_write(&EXT4_I(inode)->xattr_sem); 1360 down_write(&EXT4_I(inode)->xattr_sem);
1361 /*
1362 * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty
1363 */
1364 ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
1360retry: 1365retry:
1361 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) { 1366 isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize;
1362 up_write(&EXT4_I(inode)->xattr_sem); 1367 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
1363 return 0; 1368 goto out;
1364 }
1365 1369
1366 header = IHDR(inode, raw_inode); 1370 header = IHDR(inode, raw_inode);
1367 entry = IFIRST(header); 1371 entry = IFIRST(header);
@@ -1382,7 +1386,7 @@ retry:
1382 goto cleanup; 1386 goto cleanup;
1383 1387
1384 free = ext4_xattr_free_space(last, &min_offs, base, &total_ino); 1388 free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
1385 if (free >= new_extra_isize) { 1389 if (free >= isize_diff) {
1386 entry = IFIRST(header); 1390 entry = IFIRST(header);
1387 ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize 1391 ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
1388 - new_extra_isize, (void *)raw_inode + 1392 - new_extra_isize, (void *)raw_inode +
@@ -1390,8 +1394,7 @@ retry:
1390 (void *)header, total_ino, 1394 (void *)header, total_ino,
1391 inode->i_sb->s_blocksize); 1395 inode->i_sb->s_blocksize);
1392 EXT4_I(inode)->i_extra_isize = new_extra_isize; 1396 EXT4_I(inode)->i_extra_isize = new_extra_isize;
1393 error = 0; 1397 goto out;
1394 goto cleanup;
1395 } 1398 }
1396 1399
1397 /* 1400 /*
@@ -1414,7 +1417,7 @@ retry:
1414 end = bh->b_data + bh->b_size; 1417 end = bh->b_data + bh->b_size;
1415 min_offs = end - base; 1418 min_offs = end - base;
1416 free = ext4_xattr_free_space(first, &min_offs, base, NULL); 1419 free = ext4_xattr_free_space(first, &min_offs, base, NULL);
1417 if (free < new_extra_isize) { 1420 if (free < isize_diff) {
1418 if (!tried_min_extra_isize && s_min_extra_isize) { 1421 if (!tried_min_extra_isize && s_min_extra_isize) {
1419 tried_min_extra_isize++; 1422 tried_min_extra_isize++;
1420 new_extra_isize = s_min_extra_isize; 1423 new_extra_isize = s_min_extra_isize;
@@ -1428,7 +1431,7 @@ retry:
1428 free = inode->i_sb->s_blocksize; 1431 free = inode->i_sb->s_blocksize;
1429 } 1432 }
1430 1433
1431 while (new_extra_isize > 0) { 1434 while (isize_diff > 0) {
1432 size_t offs, size, entry_size; 1435 size_t offs, size, entry_size;
1433 struct ext4_xattr_entry *small_entry = NULL; 1436 struct ext4_xattr_entry *small_entry = NULL;
1434 struct ext4_xattr_info i = { 1437 struct ext4_xattr_info i = {
@@ -1459,7 +1462,7 @@ retry:
1459 EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) + 1462 EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
1460 EXT4_XATTR_LEN(last->e_name_len); 1463 EXT4_XATTR_LEN(last->e_name_len);
1461 if (total_size <= free && total_size < min_total_size) { 1464 if (total_size <= free && total_size < min_total_size) {
1462 if (total_size < new_extra_isize) { 1465 if (total_size < isize_diff) {
1463 small_entry = last; 1466 small_entry = last;
1464 } else { 1467 } else {
1465 entry = last; 1468 entry = last;
@@ -1514,22 +1517,22 @@ retry:
1514 error = ext4_xattr_ibody_set(handle, inode, &i, is); 1517 error = ext4_xattr_ibody_set(handle, inode, &i, is);
1515 if (error) 1518 if (error)
1516 goto cleanup; 1519 goto cleanup;
1520 total_ino -= entry_size;
1517 1521
1518 entry = IFIRST(header); 1522 entry = IFIRST(header);
1519 if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize) 1523 if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff)
1520 shift_bytes = new_extra_isize; 1524 shift_bytes = isize_diff;
1521 else 1525 else
1522 shift_bytes = entry_size + size; 1526 shift_bytes = entry_size + EXT4_XATTR_SIZE(size);
1523 /* Adjust the offsets and shift the remaining entries ahead */ 1527 /* Adjust the offsets and shift the remaining entries ahead */
1524 ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize - 1528 ext4_xattr_shift_entries(entry, -shift_bytes,
1525 shift_bytes, (void *)raw_inode + 1529 (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
1526 EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes, 1530 EXT4_I(inode)->i_extra_isize + shift_bytes,
1527 (void *)header, total_ino - entry_size, 1531 (void *)header, total_ino, inode->i_sb->s_blocksize);
1528 inode->i_sb->s_blocksize);
1529 1532
1530 extra_isize += shift_bytes; 1533 isize_diff -= shift_bytes;
1531 new_extra_isize -= shift_bytes; 1534 EXT4_I(inode)->i_extra_isize += shift_bytes;
1532 EXT4_I(inode)->i_extra_isize = extra_isize; 1535 header = IHDR(inode, raw_inode);
1533 1536
1534 i.name = b_entry_name; 1537 i.name = b_entry_name;
1535 i.value = buffer; 1538 i.value = buffer;
@@ -1551,6 +1554,8 @@ retry:
1551 kfree(bs); 1554 kfree(bs);
1552 } 1555 }
1553 brelse(bh); 1556 brelse(bh);
1557out:
1558 ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
1554 up_write(&EXT4_I(inode)->xattr_sem); 1559 up_write(&EXT4_I(inode)->xattr_sem);
1555 return 0; 1560 return 0;
1556 1561
@@ -1562,6 +1567,10 @@ cleanup:
1562 kfree(is); 1567 kfree(is);
1563 kfree(bs); 1568 kfree(bs);
1564 brelse(bh); 1569 brelse(bh);
1570 /*
1571 * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode
1572 * size expansion failed.
1573 */
1565 up_write(&EXT4_I(inode)->xattr_sem); 1574 up_write(&EXT4_I(inode)->xattr_sem);
1566 return error; 1575 return error;
1567} 1576}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 69dd3e6566e0..a92e783fa057 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -24,6 +24,7 @@
24#define EXT4_XATTR_INDEX_SYSTEM 7 24#define EXT4_XATTR_INDEX_SYSTEM 7
25#define EXT4_XATTR_INDEX_RICHACL 8 25#define EXT4_XATTR_INDEX_RICHACL 8
26#define EXT4_XATTR_INDEX_ENCRYPTION 9 26#define EXT4_XATTR_INDEX_ENCRYPTION 9
27#define EXT4_XATTR_INDEX_HURD 10 /* Reserved for Hurd */
27 28
28struct ext4_xattr_header { 29struct ext4_xattr_header {
29 __le32 h_magic; /* magic number for identification */ 30 __le32 h_magic; /* magic number for identification */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d64d2a515cb2..ccb401eebc11 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1699,11 +1699,11 @@ static int f2fs_write_end(struct file *file,
1699 trace_f2fs_write_end(inode, pos, len, copied); 1699 trace_f2fs_write_end(inode, pos, len, copied);
1700 1700
1701 set_page_dirty(page); 1701 set_page_dirty(page);
1702 f2fs_put_page(page, 1);
1703 1702
1704 if (pos + copied > i_size_read(inode)) 1703 if (pos + copied > i_size_read(inode))
1705 f2fs_i_size_write(inode, pos + copied); 1704 f2fs_i_size_write(inode, pos + copied);
1706 1705
1706 f2fs_put_page(page, 1);
1707 f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); 1707 f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
1708 return copied; 1708 return copied;
1709} 1709}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 675fa79d86f6..14f5fe2b841e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -538,7 +538,7 @@ struct f2fs_nm_info {
538 /* NAT cache management */ 538 /* NAT cache management */
539 struct radix_tree_root nat_root;/* root of the nat entry cache */ 539 struct radix_tree_root nat_root;/* root of the nat entry cache */
540 struct radix_tree_root nat_set_root;/* root of the nat set cache */ 540 struct radix_tree_root nat_set_root;/* root of the nat set cache */
541 struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ 541 struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
542 struct list_head nat_entries; /* cached nat entry list (clean) */ 542 struct list_head nat_entries; /* cached nat entry list (clean) */
543 unsigned int nat_cnt; /* the # of cached nat entries */ 543 unsigned int nat_cnt; /* the # of cached nat entries */
544 unsigned int dirty_nat_cnt; /* total num of nat entries in set */ 544 unsigned int dirty_nat_cnt; /* total num of nat entries in set */
@@ -787,7 +787,7 @@ struct f2fs_sb_info {
787 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ 787 struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
788 struct inode *meta_inode; /* cache meta blocks */ 788 struct inode *meta_inode; /* cache meta blocks */
789 struct mutex cp_mutex; /* checkpoint procedure lock */ 789 struct mutex cp_mutex; /* checkpoint procedure lock */
790 struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */ 790 struct rw_semaphore cp_rwsem; /* blocking FS operations */
791 struct rw_semaphore node_write; /* locking node writes */ 791 struct rw_semaphore node_write; /* locking node writes */
792 wait_queue_head_t cp_wait; 792 wait_queue_head_t cp_wait;
793 unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ 793 unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
@@ -1074,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
1074 1074
1075static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) 1075static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
1076{ 1076{
1077 percpu_down_read(&sbi->cp_rwsem); 1077 down_read(&sbi->cp_rwsem);
1078} 1078}
1079 1079
1080static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) 1080static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
1081{ 1081{
1082 percpu_up_read(&sbi->cp_rwsem); 1082 up_read(&sbi->cp_rwsem);
1083} 1083}
1084 1084
1085static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) 1085static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
1086{ 1086{
1087 percpu_down_write(&sbi->cp_rwsem); 1087 down_write(&sbi->cp_rwsem);
1088} 1088}
1089 1089
1090static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) 1090static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
1091{ 1091{
1092 percpu_up_write(&sbi->cp_rwsem); 1092 up_write(&sbi->cp_rwsem);
1093} 1093}
1094 1094
1095static inline int __get_cp_reason(struct f2fs_sb_info *sbi) 1095static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0e493f63ea41..28f4f4cbb8d8 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
1757{ 1757{
1758 struct fscrypt_policy policy; 1758 struct fscrypt_policy policy;
1759 struct inode *inode = file_inode(filp); 1759 struct inode *inode = file_inode(filp);
1760 int ret;
1761 1760
1762 if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, 1761 if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
1763 sizeof(policy))) 1762 sizeof(policy)))
1764 return -EFAULT; 1763 return -EFAULT;
1765 1764
1766 ret = mnt_want_write_file(filp);
1767 if (ret)
1768 return ret;
1769
1770 f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); 1765 f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
1771 ret = fscrypt_process_policy(inode, &policy);
1772 1766
1773 mnt_drop_write_file(filp); 1767 return fscrypt_process_policy(filp, &policy);
1774 return ret;
1775} 1768}
1776 1769
1777static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) 1770static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
@@ -2086,15 +2079,19 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
2086 if (unlikely(f2fs_readonly(src->i_sb))) 2079 if (unlikely(f2fs_readonly(src->i_sb)))
2087 return -EROFS; 2080 return -EROFS;
2088 2081
2089 if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode)) 2082 if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
2090 return -EISDIR; 2083 return -EINVAL;
2091 2084
2092 if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) 2085 if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
2093 return -EOPNOTSUPP; 2086 return -EOPNOTSUPP;
2094 2087
2095 inode_lock(src); 2088 inode_lock(src);
2096 if (src != dst) 2089 if (src != dst) {
2097 inode_lock(dst); 2090 if (!inode_trylock(dst)) {
2091 ret = -EBUSY;
2092 goto out;
2093 }
2094 }
2098 2095
2099 ret = -EINVAL; 2096 ret = -EINVAL;
2100 if (pos_in + len > src->i_size || pos_in + len < pos_in) 2097 if (pos_in + len > src->i_size || pos_in + len < pos_in)
@@ -2152,6 +2149,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
2152out_unlock: 2149out_unlock:
2153 if (src != dst) 2150 if (src != dst)
2154 inode_unlock(dst); 2151 inode_unlock(dst);
2152out:
2155 inode_unlock(src); 2153 inode_unlock(src);
2156 return ret; 2154 return ret;
2157} 2155}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b2fa4b615925..f75d197d5beb 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -206,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
206 struct nat_entry *e; 206 struct nat_entry *e;
207 bool need = false; 207 bool need = false;
208 208
209 percpu_down_read(&nm_i->nat_tree_lock); 209 down_read(&nm_i->nat_tree_lock);
210 e = __lookup_nat_cache(nm_i, nid); 210 e = __lookup_nat_cache(nm_i, nid);
211 if (e) { 211 if (e) {
212 if (!get_nat_flag(e, IS_CHECKPOINTED) && 212 if (!get_nat_flag(e, IS_CHECKPOINTED) &&
213 !get_nat_flag(e, HAS_FSYNCED_INODE)) 213 !get_nat_flag(e, HAS_FSYNCED_INODE))
214 need = true; 214 need = true;
215 } 215 }
216 percpu_up_read(&nm_i->nat_tree_lock); 216 up_read(&nm_i->nat_tree_lock);
217 return need; 217 return need;
218} 218}
219 219
@@ -223,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
223 struct nat_entry *e; 223 struct nat_entry *e;
224 bool is_cp = true; 224 bool is_cp = true;
225 225
226 percpu_down_read(&nm_i->nat_tree_lock); 226 down_read(&nm_i->nat_tree_lock);
227 e = __lookup_nat_cache(nm_i, nid); 227 e = __lookup_nat_cache(nm_i, nid);
228 if (e && !get_nat_flag(e, IS_CHECKPOINTED)) 228 if (e && !get_nat_flag(e, IS_CHECKPOINTED))
229 is_cp = false; 229 is_cp = false;
230 percpu_up_read(&nm_i->nat_tree_lock); 230 up_read(&nm_i->nat_tree_lock);
231 return is_cp; 231 return is_cp;
232} 232}
233 233
@@ -237,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
237 struct nat_entry *e; 237 struct nat_entry *e;
238 bool need_update = true; 238 bool need_update = true;
239 239
240 percpu_down_read(&nm_i->nat_tree_lock); 240 down_read(&nm_i->nat_tree_lock);
241 e = __lookup_nat_cache(nm_i, ino); 241 e = __lookup_nat_cache(nm_i, ino);
242 if (e && get_nat_flag(e, HAS_LAST_FSYNC) && 242 if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
243 (get_nat_flag(e, IS_CHECKPOINTED) || 243 (get_nat_flag(e, IS_CHECKPOINTED) ||
244 get_nat_flag(e, HAS_FSYNCED_INODE))) 244 get_nat_flag(e, HAS_FSYNCED_INODE)))
245 need_update = false; 245 need_update = false;
246 percpu_up_read(&nm_i->nat_tree_lock); 246 up_read(&nm_i->nat_tree_lock);
247 return need_update; 247 return need_update;
248} 248}
249 249
@@ -284,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
284 struct f2fs_nm_info *nm_i = NM_I(sbi); 284 struct f2fs_nm_info *nm_i = NM_I(sbi);
285 struct nat_entry *e; 285 struct nat_entry *e;
286 286
287 percpu_down_write(&nm_i->nat_tree_lock); 287 down_write(&nm_i->nat_tree_lock);
288 e = __lookup_nat_cache(nm_i, ni->nid); 288 e = __lookup_nat_cache(nm_i, ni->nid);
289 if (!e) { 289 if (!e) {
290 e = grab_nat_entry(nm_i, ni->nid); 290 e = grab_nat_entry(nm_i, ni->nid);
@@ -334,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
334 set_nat_flag(e, HAS_FSYNCED_INODE, true); 334 set_nat_flag(e, HAS_FSYNCED_INODE, true);
335 set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); 335 set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
336 } 336 }
337 percpu_up_write(&nm_i->nat_tree_lock); 337 up_write(&nm_i->nat_tree_lock);
338} 338}
339 339
340int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) 340int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -342,7 +342,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
342 struct f2fs_nm_info *nm_i = NM_I(sbi); 342 struct f2fs_nm_info *nm_i = NM_I(sbi);
343 int nr = nr_shrink; 343 int nr = nr_shrink;
344 344
345 percpu_down_write(&nm_i->nat_tree_lock); 345 if (!down_write_trylock(&nm_i->nat_tree_lock))
346 return 0;
346 347
347 while (nr_shrink && !list_empty(&nm_i->nat_entries)) { 348 while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
348 struct nat_entry *ne; 349 struct nat_entry *ne;
@@ -351,7 +352,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
351 __del_from_nat_cache(nm_i, ne); 352 __del_from_nat_cache(nm_i, ne);
352 nr_shrink--; 353 nr_shrink--;
353 } 354 }
354 percpu_up_write(&nm_i->nat_tree_lock); 355 up_write(&nm_i->nat_tree_lock);
355 return nr - nr_shrink; 356 return nr - nr_shrink;
356} 357}
357 358
@@ -373,13 +374,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
373 ni->nid = nid; 374 ni->nid = nid;
374 375
375 /* Check nat cache */ 376 /* Check nat cache */
376 percpu_down_read(&nm_i->nat_tree_lock); 377 down_read(&nm_i->nat_tree_lock);
377 e = __lookup_nat_cache(nm_i, nid); 378 e = __lookup_nat_cache(nm_i, nid);
378 if (e) { 379 if (e) {
379 ni->ino = nat_get_ino(e); 380 ni->ino = nat_get_ino(e);
380 ni->blk_addr = nat_get_blkaddr(e); 381 ni->blk_addr = nat_get_blkaddr(e);
381 ni->version = nat_get_version(e); 382 ni->version = nat_get_version(e);
382 percpu_up_read(&nm_i->nat_tree_lock); 383 up_read(&nm_i->nat_tree_lock);
383 return; 384 return;
384 } 385 }
385 386
@@ -403,11 +404,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
403 node_info_from_raw_nat(ni, &ne); 404 node_info_from_raw_nat(ni, &ne);
404 f2fs_put_page(page, 1); 405 f2fs_put_page(page, 1);
405cache: 406cache:
406 percpu_up_read(&nm_i->nat_tree_lock); 407 up_read(&nm_i->nat_tree_lock);
407 /* cache nat entry */ 408 /* cache nat entry */
408 percpu_down_write(&nm_i->nat_tree_lock); 409 down_write(&nm_i->nat_tree_lock);
409 cache_nat_entry(sbi, nid, &ne); 410 cache_nat_entry(sbi, nid, &ne);
410 percpu_up_write(&nm_i->nat_tree_lock); 411 up_write(&nm_i->nat_tree_lock);
411} 412}
412 413
413/* 414/*
@@ -1788,7 +1789,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
1788 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, 1789 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
1789 META_NAT, true); 1790 META_NAT, true);
1790 1791
1791 percpu_down_read(&nm_i->nat_tree_lock); 1792 down_read(&nm_i->nat_tree_lock);
1792 1793
1793 while (1) { 1794 while (1) {
1794 struct page *page = get_current_nat_page(sbi, nid); 1795 struct page *page = get_current_nat_page(sbi, nid);
@@ -1820,7 +1821,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
1820 remove_free_nid(nm_i, nid); 1821 remove_free_nid(nm_i, nid);
1821 } 1822 }
1822 up_read(&curseg->journal_rwsem); 1823 up_read(&curseg->journal_rwsem);
1823 percpu_up_read(&nm_i->nat_tree_lock); 1824 up_read(&nm_i->nat_tree_lock);
1824 1825
1825 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), 1826 ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
1826 nm_i->ra_nid_pages, META_NAT, false); 1827 nm_i->ra_nid_pages, META_NAT, false);
@@ -2209,7 +2210,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
2209 if (!nm_i->dirty_nat_cnt) 2210 if (!nm_i->dirty_nat_cnt)
2210 return; 2211 return;
2211 2212
2212 percpu_down_write(&nm_i->nat_tree_lock); 2213 down_write(&nm_i->nat_tree_lock);
2213 2214
2214 /* 2215 /*
2215 * if there are no enough space in journal to store dirty nat 2216 * if there are no enough space in journal to store dirty nat
@@ -2232,7 +2233,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
2232 list_for_each_entry_safe(set, tmp, &sets, set_list) 2233 list_for_each_entry_safe(set, tmp, &sets, set_list)
2233 __flush_nat_entry_set(sbi, set); 2234 __flush_nat_entry_set(sbi, set);
2234 2235
2235 percpu_up_write(&nm_i->nat_tree_lock); 2236 up_write(&nm_i->nat_tree_lock);
2236 2237
2237 f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); 2238 f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
2238} 2239}
@@ -2268,8 +2269,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
2268 2269
2269 mutex_init(&nm_i->build_lock); 2270 mutex_init(&nm_i->build_lock);
2270 spin_lock_init(&nm_i->free_nid_list_lock); 2271 spin_lock_init(&nm_i->free_nid_list_lock);
2271 if (percpu_init_rwsem(&nm_i->nat_tree_lock)) 2272 init_rwsem(&nm_i->nat_tree_lock);
2272 return -ENOMEM;
2273 2273
2274 nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); 2274 nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
2275 nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); 2275 nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -2326,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2326 spin_unlock(&nm_i->free_nid_list_lock); 2326 spin_unlock(&nm_i->free_nid_list_lock);
2327 2327
2328 /* destroy nat cache */ 2328 /* destroy nat cache */
2329 percpu_down_write(&nm_i->nat_tree_lock); 2329 down_write(&nm_i->nat_tree_lock);
2330 while ((found = __gang_lookup_nat_cache(nm_i, 2330 while ((found = __gang_lookup_nat_cache(nm_i,
2331 nid, NATVEC_SIZE, natvec))) { 2331 nid, NATVEC_SIZE, natvec))) {
2332 unsigned idx; 2332 unsigned idx;
@@ -2351,9 +2351,8 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2351 kmem_cache_free(nat_entry_set_slab, setvec[idx]); 2351 kmem_cache_free(nat_entry_set_slab, setvec[idx]);
2352 } 2352 }
2353 } 2353 }
2354 percpu_up_write(&nm_i->nat_tree_lock); 2354 up_write(&nm_i->nat_tree_lock);
2355 2355
2356 percpu_free_rwsem(&nm_i->nat_tree_lock);
2357 kfree(nm_i->nat_bitmap); 2356 kfree(nm_i->nat_bitmap);
2358 sbi->nm_info = NULL; 2357 sbi->nm_info = NULL;
2359 kfree(nm_i); 2358 kfree(nm_i);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1b86d3f638ef..7f863a645ab1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -706,8 +706,6 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
706 percpu_counter_destroy(&sbi->nr_pages[i]); 706 percpu_counter_destroy(&sbi->nr_pages[i]);
707 percpu_counter_destroy(&sbi->alloc_valid_block_count); 707 percpu_counter_destroy(&sbi->alloc_valid_block_count);
708 percpu_counter_destroy(&sbi->total_valid_inode_count); 708 percpu_counter_destroy(&sbi->total_valid_inode_count);
709
710 percpu_free_rwsem(&sbi->cp_rwsem);
711} 709}
712 710
713static void f2fs_put_super(struct super_block *sb) 711static void f2fs_put_super(struct super_block *sb)
@@ -1483,9 +1481,6 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
1483{ 1481{
1484 int i, err; 1482 int i, err;
1485 1483
1486 if (percpu_init_rwsem(&sbi->cp_rwsem))
1487 return -ENOMEM;
1488
1489 for (i = 0; i < NR_COUNT_TYPE; i++) { 1484 for (i = 0; i < NR_COUNT_TYPE; i++) {
1490 err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); 1485 err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
1491 if (err) 1486 if (err)
@@ -1686,6 +1681,7 @@ try_onemore:
1686 sbi->write_io[i].bio = NULL; 1681 sbi->write_io[i].bio = NULL;
1687 } 1682 }
1688 1683
1684 init_rwsem(&sbi->cp_rwsem);
1689 init_waitqueue_head(&sbi->cp_wait); 1685 init_waitqueue_head(&sbi->cp_wait);
1690 init_sb_info(sbi); 1686 init_sb_info(sbi);
1691 1687
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4d09d4441e3e..05713a5da083 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1949,6 +1949,12 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
1949{ 1949{
1950 struct backing_dev_info *bdi; 1950 struct backing_dev_info *bdi;
1951 1951
1952 /*
1953 * If we are expecting writeback progress we must submit plugged IO.
1954 */
1955 if (blk_needs_flush_plug(current))
1956 blk_schedule_flush_plug(current);
1957
1952 if (!nr_pages) 1958 if (!nr_pages)
1953 nr_pages = get_nr_dirty_pages(); 1959 nr_pages = get_nr_dirty_pages();
1954 1960
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f394aff59c36..3988b43c2f5a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
530 req->out.args[0].size = count; 530 req->out.args[0].size = count;
531} 531}
532 532
533static void fuse_release_user_pages(struct fuse_req *req, int write) 533static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty)
534{ 534{
535 unsigned i; 535 unsigned i;
536 536
537 for (i = 0; i < req->num_pages; i++) { 537 for (i = 0; i < req->num_pages; i++) {
538 struct page *page = req->pages[i]; 538 struct page *page = req->pages[i];
539 if (write) 539 if (should_dirty)
540 set_page_dirty_lock(page); 540 set_page_dirty_lock(page);
541 put_page(page); 541 put_page(page);
542 } 542 }
@@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1320 loff_t *ppos, int flags) 1320 loff_t *ppos, int flags)
1321{ 1321{
1322 int write = flags & FUSE_DIO_WRITE; 1322 int write = flags & FUSE_DIO_WRITE;
1323 bool should_dirty = !write && iter_is_iovec(iter);
1323 int cuse = flags & FUSE_DIO_CUSE; 1324 int cuse = flags & FUSE_DIO_CUSE;
1324 struct file *file = io->file; 1325 struct file *file = io->file;
1325 struct inode *inode = file->f_mapping->host; 1326 struct inode *inode = file->f_mapping->host;
@@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1363 nres = fuse_send_read(req, io, pos, nbytes, owner); 1364 nres = fuse_send_read(req, io, pos, nbytes, owner);
1364 1365
1365 if (!io->async) 1366 if (!io->async)
1366 fuse_release_user_pages(req, !write); 1367 fuse_release_user_pages(req, should_dirty);
1367 if (req->out.h.error) { 1368 if (req->out.h.error) {
1368 err = req->out.h.error; 1369 err = req->out.h.error;
1369 break; 1370 break;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0f56deb24ce6..c415668c86d4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -568,7 +568,7 @@ static int ioctl_fsthaw(struct file *filp)
568 return thaw_super(sb); 568 return thaw_super(sb);
569} 569}
570 570
571static long ioctl_file_dedupe_range(struct file *file, void __user *arg) 571static int ioctl_file_dedupe_range(struct file *file, void __user *arg)
572{ 572{
573 struct file_dedupe_range __user *argp = arg; 573 struct file_dedupe_range __user *argp = arg;
574 struct file_dedupe_range *same = NULL; 574 struct file_dedupe_range *same = NULL;
@@ -582,6 +582,10 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
582 } 582 }
583 583
584 size = offsetof(struct file_dedupe_range __user, info[count]); 584 size = offsetof(struct file_dedupe_range __user, info[count]);
585 if (size > PAGE_SIZE) {
586 ret = -ENOMEM;
587 goto out;
588 }
585 589
586 same = memdup_user(argp, size); 590 same = memdup_user(argp, size);
587 if (IS_ERR(same)) { 591 if (IS_ERR(same)) {
diff --git a/fs/iomap.c b/fs/iomap.c
index 48141b8eff5f..706270f21b35 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -84,8 +84,11 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
84 * Now the data has been copied, commit the range we've copied. This 84 * Now the data has been copied, commit the range we've copied. This
85 * should not fail unless the filesystem has had a fatal error. 85 * should not fail unless the filesystem has had a fatal error.
86 */ 86 */
87 ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0, 87 if (ops->iomap_end) {
88 flags, &iomap); 88 ret = ops->iomap_end(inode, pos, length,
89 written > 0 ? written : 0,
90 flags, &iomap);
91 }
89 92
90 return written ? written : ret; 93 return written ? written : ret;
91} 94}
@@ -194,12 +197,9 @@ again:
194 if (mapping_writably_mapped(inode->i_mapping)) 197 if (mapping_writably_mapped(inode->i_mapping))
195 flush_dcache_page(page); 198 flush_dcache_page(page);
196 199
197 pagefault_disable();
198 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 200 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
199 pagefault_enable();
200 201
201 flush_dcache_page(page); 202 flush_dcache_page(page);
202 mark_page_accessed(page);
203 203
204 status = iomap_write_end(inode, pos, bytes, copied, page); 204 status = iomap_write_end(inode, pos, bytes, copied, page);
205 if (unlikely(status < 0)) 205 if (unlikely(status < 0))
@@ -428,9 +428,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
428 break; 428 break;
429 } 429 }
430 430
431 if (iomap->flags & IOMAP_F_MERGED)
432 flags |= FIEMAP_EXTENT_MERGED;
433
431 return fiemap_fill_next_extent(fi, iomap->offset, 434 return fiemap_fill_next_extent(fi, iomap->offset,
432 iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, 435 iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
433 iomap->length, flags | FIEMAP_EXTENT_MERGED); 436 iomap->length, flags);
434 437
435} 438}
436 439
@@ -470,13 +473,18 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
470 if (ret) 473 if (ret)
471 return ret; 474 return ret;
472 475
473 ret = filemap_write_and_wait(inode->i_mapping); 476 if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
474 if (ret) 477 ret = filemap_write_and_wait(inode->i_mapping);
475 return ret; 478 if (ret)
479 return ret;
480 }
476 481
477 while (len > 0) { 482 while (len > 0) {
478 ret = iomap_apply(inode, start, len, 0, ops, &ctx, 483 ret = iomap_apply(inode, start, len, 0, ops, &ctx,
479 iomap_fiemap_actor); 484 iomap_fiemap_actor);
485 /* inode with no (attribute) mapping will give ENOENT */
486 if (ret == -ENOENT)
487 break;
480 if (ret < 0) 488 if (ret < 0)
481 return ret; 489 return ret;
482 if (ret == 0) 490 if (ret == 0)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e1574008adc9..2bcb86e6e6ca 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -840,21 +840,35 @@ repeat:
840 mutex_lock(&kernfs_mutex); 840 mutex_lock(&kernfs_mutex);
841 841
842 list_for_each_entry(info, &kernfs_root(kn)->supers, node) { 842 list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
843 struct kernfs_node *parent;
843 struct inode *inode; 844 struct inode *inode;
844 struct dentry *dentry;
845 845
846 /*
847 * We want fsnotify_modify() on @kn but as the
848 * modifications aren't originating from userland don't
849 * have the matching @file available. Look up the inodes
850 * and generate the events manually.
851 */
846 inode = ilookup(info->sb, kn->ino); 852 inode = ilookup(info->sb, kn->ino);
847 if (!inode) 853 if (!inode)
848 continue; 854 continue;
849 855
850 dentry = d_find_any_alias(inode); 856 parent = kernfs_get_parent(kn);
851 if (dentry) { 857 if (parent) {
852 fsnotify_parent(NULL, dentry, FS_MODIFY); 858 struct inode *p_inode;
853 fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, 859
854 NULL, 0); 860 p_inode = ilookup(info->sb, parent->ino);
855 dput(dentry); 861 if (p_inode) {
862 fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
863 inode, FSNOTIFY_EVENT_INODE, kn->name, 0);
864 iput(p_inode);
865 }
866
867 kernfs_put(parent);
856 } 868 }
857 869
870 fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
871 kn->name, 0);
858 iput(inode); 872 iput(inode);
859 } 873 }
860 874
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index f55a4e756047..217847679f0e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -346,7 +346,7 @@ static void bl_write_cleanup(struct work_struct *work)
346 PAGE_SIZE - 1) & (loff_t)PAGE_MASK; 346 PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
347 347
348 ext_tree_mark_written(bl, start >> SECTOR_SHIFT, 348 ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
349 (end - start) >> SECTOR_SHIFT); 349 (end - start) >> SECTOR_SHIFT, end);
350 } 350 }
351 351
352 pnfs_ld_write_done(hdr); 352 pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 18e6fd0b9506..efc007f00742 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -141,6 +141,7 @@ struct pnfs_block_layout {
141 struct rb_root bl_ext_ro; 141 struct rb_root bl_ext_ro;
142 spinlock_t bl_ext_lock; /* Protects list manipulation */ 142 spinlock_t bl_ext_lock; /* Protects list manipulation */
143 bool bl_scsi_layout; 143 bool bl_scsi_layout;
144 u64 bl_lwb;
144}; 145};
145 146
146static inline struct pnfs_block_layout * 147static inline struct pnfs_block_layout *
@@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
182int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, 183int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
183 sector_t end); 184 sector_t end);
184int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, 185int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
185 sector_t len); 186 sector_t len, u64 lwb);
186bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, 187bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
187 struct pnfs_block_extent *ret, bool rw); 188 struct pnfs_block_extent *ret, bool rw);
188int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); 189int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 992bcb19c11e..c85fbfd2d0d9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
402 402
403int 403int
404ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, 404ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
405 sector_t len) 405 sector_t len, u64 lwb)
406{ 406{
407 struct rb_root *root = &bl->bl_ext_rw; 407 struct rb_root *root = &bl->bl_ext_rw;
408 sector_t end = start + len; 408 sector_t end = start + len;
@@ -471,6 +471,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
471 } 471 }
472 } 472 }
473out: 473out:
474 if (bl->bl_lwb < lwb)
475 bl->bl_lwb = lwb;
474 spin_unlock(&bl->bl_ext_lock); 476 spin_unlock(&bl->bl_ext_lock);
475 477
476 __ext_put_deviceids(&tmp); 478 __ext_put_deviceids(&tmp);
@@ -518,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
518} 520}
519 521
520static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, 522static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
521 size_t buffer_size, size_t *count) 523 size_t buffer_size, size_t *count, __u64 *lastbyte)
522{ 524{
523 struct pnfs_block_extent *be; 525 struct pnfs_block_extent *be;
524 int ret = 0; 526 int ret = 0;
@@ -542,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
542 p = encode_block_extent(be, p); 544 p = encode_block_extent(be, p);
543 be->be_tag = EXTENT_COMMITTING; 545 be->be_tag = EXTENT_COMMITTING;
544 } 546 }
547 *lastbyte = bl->bl_lwb - 1;
548 bl->bl_lwb = 0;
545 spin_unlock(&bl->bl_ext_lock); 549 spin_unlock(&bl->bl_ext_lock);
546 550
547 return ret; 551 return ret;
@@ -564,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
564 arg->layoutupdate_pages = &arg->layoutupdate_page; 568 arg->layoutupdate_pages = &arg->layoutupdate_page;
565 569
566retry: 570retry:
567 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); 571 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
568 if (unlikely(ret)) { 572 if (unlikely(ret)) {
569 ext_tree_free_commitdata(arg, buffer_size); 573 ext_tree_free_commitdata(arg, buffer_size);
570 574
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a7f2e6e33305..52a28311e2a4 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
275err_socks: 275err_socks:
276 svc_rpcb_cleanup(serv, net); 276 svc_rpcb_cleanup(serv, net);
277err_bind: 277err_bind:
278 nn->cb_users[minorversion]--;
278 dprintk("NFS: Couldn't create callback socket: err = %d; " 279 dprintk("NFS: Couldn't create callback socket: err = %d; "
279 "net = %p\n", ret, net); 280 "net = %p\n", ret, net);
280 return ret; 281 return ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c92a75e066a6..f953ef6b2f2e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -454,11 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp,
454 ((u32 *)&rclist->rcl_sessionid.data)[3], 454 ((u32 *)&rclist->rcl_sessionid.data)[3],
455 ref->rc_sequenceid, ref->rc_slotid); 455 ref->rc_sequenceid, ref->rc_slotid);
456 456
457 spin_lock(&tbl->slot_tbl_lock); 457 status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid,
458 status = (test_bit(ref->rc_slotid, tbl->used_slots) && 458 ref->rc_sequenceid, HZ >> 1) < 0;
459 tbl->slots[ref->rc_slotid].seq_nr ==
460 ref->rc_sequenceid);
461 spin_unlock(&tbl->slot_tbl_lock);
462 if (status) 459 if (status)
463 goto out; 460 goto out;
464 } 461 }
@@ -487,7 +484,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
487 goto out; 484 goto out;
488 485
489 tbl = &clp->cl_session->bc_slot_table; 486 tbl = &clp->cl_session->bc_slot_table;
490 slot = tbl->slots + args->csa_slotid;
491 487
492 /* Set up res before grabbing the spinlock */ 488 /* Set up res before grabbing the spinlock */
493 memcpy(&res->csr_sessionid, &args->csa_sessionid, 489 memcpy(&res->csr_sessionid, &args->csa_sessionid,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 003ebce4bbc4..1e106780a237 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -426,7 +426,7 @@ EXPORT_SYMBOL_GPL(nfs_mark_client_ready);
426 * Initialise the timeout values for a connection 426 * Initialise the timeout values for a connection
427 */ 427 */
428void nfs_init_timeout_values(struct rpc_timeout *to, int proto, 428void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
429 unsigned int timeo, unsigned int retrans) 429 int timeo, int retrans)
430{ 430{
431 to->to_initval = timeo * HZ / 10; 431 to->to_initval = timeo * HZ / 10;
432 to->to_retries = retrans; 432 to->to_retries = retrans;
@@ -434,9 +434,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
434 switch (proto) { 434 switch (proto) {
435 case XPRT_TRANSPORT_TCP: 435 case XPRT_TRANSPORT_TCP:
436 case XPRT_TRANSPORT_RDMA: 436 case XPRT_TRANSPORT_RDMA:
437 if (to->to_retries == 0) 437 if (retrans == NFS_UNSPEC_RETRANS)
438 to->to_retries = NFS_DEF_TCP_RETRANS; 438 to->to_retries = NFS_DEF_TCP_RETRANS;
439 if (to->to_initval == 0) 439 if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
440 to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; 440 to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
441 if (to->to_initval > NFS_MAX_TCP_TIMEOUT) 441 if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
442 to->to_initval = NFS_MAX_TCP_TIMEOUT; 442 to->to_initval = NFS_MAX_TCP_TIMEOUT;
@@ -449,9 +449,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
449 to->to_exponential = 0; 449 to->to_exponential = 0;
450 break; 450 break;
451 case XPRT_TRANSPORT_UDP: 451 case XPRT_TRANSPORT_UDP:
452 if (to->to_retries == 0) 452 if (retrans == NFS_UNSPEC_RETRANS)
453 to->to_retries = NFS_DEF_UDP_RETRANS; 453 to->to_retries = NFS_DEF_UDP_RETRANS;
454 if (!to->to_initval) 454 if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
455 to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10; 455 to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
456 if (to->to_initval > NFS_MAX_UDP_TIMEOUT) 456 if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
457 to->to_initval = NFS_MAX_UDP_TIMEOUT; 457 to->to_initval = NFS_MAX_UDP_TIMEOUT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7d620970f2e1..ca699ddc11c1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
657 if (result <= 0) 657 if (result <= 0)
658 goto out; 658 goto out;
659 659
660 written = generic_write_sync(iocb, result); 660 result = generic_write_sync(iocb, result);
661 if (result < 0)
662 goto out;
663 written = result;
661 iocb->ki_pos += written; 664 iocb->ki_pos += written;
662 665
663 /* Return error values */ 666 /* Return error values */
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index e6206eaf2bdf..51b51369704c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -37,6 +37,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
37 if (ffl) { 37 if (ffl) {
38 INIT_LIST_HEAD(&ffl->error_list); 38 INIT_LIST_HEAD(&ffl->error_list);
39 INIT_LIST_HEAD(&ffl->mirrors); 39 INIT_LIST_HEAD(&ffl->mirrors);
40 ffl->last_report_time = ktime_get();
40 return &ffl->generic_hdr; 41 return &ffl->generic_hdr;
41 } else 42 } else
42 return NULL; 43 return NULL;
@@ -640,19 +641,18 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
640{ 641{
641 static const ktime_t notime = {0}; 642 static const ktime_t notime = {0};
642 s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; 643 s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
644 struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
643 645
644 nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); 646 nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
645 if (ktime_equal(mirror->start_time, notime)) 647 if (ktime_equal(mirror->start_time, notime))
646 mirror->start_time = now; 648 mirror->start_time = now;
647 if (ktime_equal(mirror->last_report_time, notime))
648 mirror->last_report_time = now;
649 if (mirror->report_interval != 0) 649 if (mirror->report_interval != 0)
650 report_interval = (s64)mirror->report_interval * 1000LL; 650 report_interval = (s64)mirror->report_interval * 1000LL;
651 else if (layoutstats_timer != 0) 651 else if (layoutstats_timer != 0)
652 report_interval = (s64)layoutstats_timer * 1000LL; 652 report_interval = (s64)layoutstats_timer * 1000LL;
653 if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= 653 if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
654 report_interval) { 654 report_interval) {
655 mirror->last_report_time = now; 655 ffl->last_report_time = now;
656 return true; 656 return true;
657 } 657 }
658 658
@@ -806,11 +806,14 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
806{ 806{
807 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); 807 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
808 struct nfs4_pnfs_ds *ds; 808 struct nfs4_pnfs_ds *ds;
809 bool fail_return = false;
809 int idx; 810 int idx;
810 811
811 /* mirrors are sorted by efficiency */ 812 /* mirrors are sorted by efficiency */
812 for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { 813 for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
813 ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); 814 if (idx+1 == fls->mirror_array_cnt)
815 fail_return = true;
816 ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return);
814 if (ds) { 817 if (ds) {
815 *best_idx = idx; 818 *best_idx = idx;
816 return ds; 819 return ds;
@@ -859,6 +862,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
859 struct nfs4_pnfs_ds *ds; 862 struct nfs4_pnfs_ds *ds;
860 int ds_idx; 863 int ds_idx;
861 864
865retry:
862 /* Use full layout for now */ 866 /* Use full layout for now */
863 if (!pgio->pg_lseg) 867 if (!pgio->pg_lseg)
864 ff_layout_pg_get_read(pgio, req, false); 868 ff_layout_pg_get_read(pgio, req, false);
@@ -871,10 +875,13 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
871 875
872 ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); 876 ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
873 if (!ds) { 877 if (!ds) {
874 if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) 878 if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
875 goto out_pnfs;
876 else
877 goto out_mds; 879 goto out_mds;
880 pnfs_put_lseg(pgio->pg_lseg);
881 pgio->pg_lseg = NULL;
882 /* Sleep for 1 second before retrying */
883 ssleep(1);
884 goto retry;
878 } 885 }
879 886
880 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); 887 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -890,12 +897,6 @@ out_mds:
890 pnfs_put_lseg(pgio->pg_lseg); 897 pnfs_put_lseg(pgio->pg_lseg);
891 pgio->pg_lseg = NULL; 898 pgio->pg_lseg = NULL;
892 nfs_pageio_reset_read_mds(pgio); 899 nfs_pageio_reset_read_mds(pgio);
893 return;
894
895out_pnfs:
896 pnfs_set_lo_fail(pgio->pg_lseg);
897 pnfs_put_lseg(pgio->pg_lseg);
898 pgio->pg_lseg = NULL;
899} 900}
900 901
901static void 902static void
@@ -909,6 +910,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
909 int i; 910 int i;
910 int status; 911 int status;
911 912
913retry:
912 if (!pgio->pg_lseg) { 914 if (!pgio->pg_lseg) {
913 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 915 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
914 req->wb_context, 916 req->wb_context,
@@ -940,10 +942,13 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
940 for (i = 0; i < pgio->pg_mirror_count; i++) { 942 for (i = 0; i < pgio->pg_mirror_count; i++) {
941 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); 943 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
942 if (!ds) { 944 if (!ds) {
943 if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) 945 if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
944 goto out_pnfs;
945 else
946 goto out_mds; 946 goto out_mds;
947 pnfs_put_lseg(pgio->pg_lseg);
948 pgio->pg_lseg = NULL;
949 /* Sleep for 1 second before retrying */
950 ssleep(1);
951 goto retry;
947 } 952 }
948 pgm = &pgio->pg_mirrors[i]; 953 pgm = &pgio->pg_mirrors[i];
949 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); 954 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
@@ -956,12 +961,6 @@ out_mds:
956 pnfs_put_lseg(pgio->pg_lseg); 961 pnfs_put_lseg(pgio->pg_lseg);
957 pgio->pg_lseg = NULL; 962 pgio->pg_lseg = NULL;
958 nfs_pageio_reset_write_mds(pgio); 963 nfs_pageio_reset_write_mds(pgio);
959 return;
960
961out_pnfs:
962 pnfs_set_lo_fail(pgio->pg_lseg);
963 pnfs_put_lseg(pgio->pg_lseg);
964 pgio->pg_lseg = NULL;
965} 964}
966 965
967static unsigned int 966static unsigned int
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 1bcdb15d0c41..3ee0c9fcea76 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -84,7 +84,6 @@ struct nfs4_ff_layout_mirror {
84 struct nfs4_ff_layoutstat read_stat; 84 struct nfs4_ff_layoutstat read_stat;
85 struct nfs4_ff_layoutstat write_stat; 85 struct nfs4_ff_layoutstat write_stat;
86 ktime_t start_time; 86 ktime_t start_time;
87 ktime_t last_report_time;
88 u32 report_interval; 87 u32 report_interval;
89}; 88};
90 89
@@ -101,6 +100,7 @@ struct nfs4_flexfile_layout {
101 struct pnfs_ds_commit_info commit_info; 100 struct pnfs_ds_commit_info commit_info;
102 struct list_head mirrors; 101 struct list_head mirrors;
103 struct list_head error_list; /* nfs4_ff_layout_ds_err */ 102 struct list_head error_list; /* nfs4_ff_layout_ds_err */
103 ktime_t last_report_time; /* Layoutstat report times */
104}; 104};
105 105
106static inline struct nfs4_flexfile_layout * 106static inline struct nfs4_flexfile_layout *
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 0aa36be71fce..f7a3f6b05369 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -17,8 +17,8 @@
17 17
18#define NFSDBG_FACILITY NFSDBG_PNFS_LD 18#define NFSDBG_FACILITY NFSDBG_PNFS_LD
19 19
20static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; 20static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
21static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; 21static unsigned int dataserver_retrans;
22 22
23void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) 23void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
24{ 24{
@@ -379,7 +379,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
379 379
380 devid = &mirror->mirror_ds->id_node; 380 devid = &mirror->mirror_ds->id_node;
381 if (ff_layout_test_devid_unavailable(devid)) 381 if (ff_layout_test_devid_unavailable(devid))
382 goto out; 382 goto out_fail;
383 383
384 ds = mirror->mirror_ds->ds; 384 ds = mirror->mirror_ds->ds;
385 /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ 385 /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -405,15 +405,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
405 mirror->mirror_ds->ds_versions[0].rsize = max_payload; 405 mirror->mirror_ds->ds_versions[0].rsize = max_payload;
406 if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) 406 if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
407 mirror->mirror_ds->ds_versions[0].wsize = max_payload; 407 mirror->mirror_ds->ds_versions[0].wsize = max_payload;
408 } else { 408 goto out;
409 ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
410 mirror, lseg->pls_range.offset,
411 lseg->pls_range.length, NFS4ERR_NXIO,
412 OP_ILLEGAL, GFP_NOIO);
413 if (fail_return || !ff_layout_has_available_ds(lseg))
414 pnfs_error_mark_layout_for_return(ino, lseg);
415 ds = NULL;
416 } 409 }
410 ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
411 mirror, lseg->pls_range.offset,
412 lseg->pls_range.length, NFS4ERR_NXIO,
413 OP_ILLEGAL, GFP_NOIO);
414out_fail:
415 if (fail_return || !ff_layout_has_available_ds(lseg))
416 pnfs_error_mark_layout_for_return(ino, lseg);
417 ds = NULL;
417out: 418out:
418 return ds; 419 return ds;
419} 420}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7ce5e023c3c3..74935a19e4bf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -58,6 +58,9 @@ struct nfs_clone_mount {
58 */ 58 */
59#define NFS_UNSPEC_PORT (-1) 59#define NFS_UNSPEC_PORT (-1)
60 60
61#define NFS_UNSPEC_RETRANS (UINT_MAX)
62#define NFS_UNSPEC_TIMEO (UINT_MAX)
63
61/* 64/*
62 * Maximum number of pages that readdir can use for creating 65 * Maximum number of pages that readdir can use for creating
63 * a vmapped array of pages. 66 * a vmapped array of pages.
@@ -156,7 +159,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
156int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); 159int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
157void nfs_server_insert_lists(struct nfs_server *); 160void nfs_server_insert_lists(struct nfs_server *);
158void nfs_server_remove_lists(struct nfs_server *); 161void nfs_server_remove_lists(struct nfs_server *);
159void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int); 162void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
160int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, 163int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
161 rpc_authflavor_t); 164 rpc_authflavor_t);
162struct nfs_server *nfs_alloc_server(void); 165struct nfs_server *nfs_alloc_server(void);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 33da841a21bb..64b43b4ad9dd 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -318,10 +318,22 @@ static void
318nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) 318nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
319{ 319{
320 struct nfs42_layoutstat_data *data = calldata; 320 struct nfs42_layoutstat_data *data = calldata;
321 struct nfs_server *server = NFS_SERVER(data->args.inode); 321 struct inode *inode = data->inode;
322 struct nfs_server *server = NFS_SERVER(inode);
323 struct pnfs_layout_hdr *lo;
322 324
325 spin_lock(&inode->i_lock);
326 lo = NFS_I(inode)->layout;
327 if (!pnfs_layout_is_valid(lo)) {
328 spin_unlock(&inode->i_lock);
329 rpc_exit(task, 0);
330 return;
331 }
332 nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid);
333 spin_unlock(&inode->i_lock);
323 nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, 334 nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
324 &data->res.seq_res, task); 335 &data->res.seq_res, task);
336
325} 337}
326 338
327static void 339static void
@@ -338,12 +350,14 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
338 case 0: 350 case 0:
339 break; 351 break;
340 case -NFS4ERR_EXPIRED: 352 case -NFS4ERR_EXPIRED:
353 case -NFS4ERR_ADMIN_REVOKED:
354 case -NFS4ERR_DELEG_REVOKED:
341 case -NFS4ERR_STALE_STATEID: 355 case -NFS4ERR_STALE_STATEID:
342 case -NFS4ERR_OLD_STATEID:
343 case -NFS4ERR_BAD_STATEID: 356 case -NFS4ERR_BAD_STATEID:
344 spin_lock(&inode->i_lock); 357 spin_lock(&inode->i_lock);
345 lo = NFS_I(inode)->layout; 358 lo = NFS_I(inode)->layout;
346 if (lo && nfs4_stateid_match(&data->args.stateid, 359 if (pnfs_layout_is_valid(lo) &&
360 nfs4_stateid_match(&data->args.stateid,
347 &lo->plh_stateid)) { 361 &lo->plh_stateid)) {
348 LIST_HEAD(head); 362 LIST_HEAD(head);
349 363
@@ -357,11 +371,23 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
357 } else 371 } else
358 spin_unlock(&inode->i_lock); 372 spin_unlock(&inode->i_lock);
359 break; 373 break;
374 case -NFS4ERR_OLD_STATEID:
375 spin_lock(&inode->i_lock);
376 lo = NFS_I(inode)->layout;
377 if (pnfs_layout_is_valid(lo) &&
378 nfs4_stateid_match_other(&data->args.stateid,
379 &lo->plh_stateid)) {
380 /* Do we need to delay before resending? */
381 if (!nfs4_stateid_is_newer(&lo->plh_stateid,
382 &data->args.stateid))
383 rpc_delay(task, HZ);
384 rpc_restart_call_prepare(task);
385 }
386 spin_unlock(&inode->i_lock);
387 break;
360 case -ENOTSUPP: 388 case -ENOTSUPP:
361 case -EOPNOTSUPP: 389 case -EOPNOTSUPP:
362 NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; 390 NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
363 default:
364 break;
365 } 391 }
366 392
367 dprintk("%s server returns %d\n", __func__, task->tk_status); 393 dprintk("%s server returns %d\n", __func__, task->tk_status);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 324bfdc21250..9bf64eacba5b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -396,6 +396,10 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *);
396extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); 396extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
397extern void nfs4_kill_renewd(struct nfs_client *); 397extern void nfs4_kill_renewd(struct nfs_client *);
398extern void nfs4_renew_state(struct work_struct *); 398extern void nfs4_renew_state(struct work_struct *);
399extern void nfs4_set_lease_period(struct nfs_client *clp,
400 unsigned long lease,
401 unsigned long lastrenewed);
402
399 403
400/* nfs4state.c */ 404/* nfs4state.c */
401struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp); 405struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8d7d08d4f95f..cd3b7cfdde16 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -817,6 +817,11 @@ static int nfs4_set_client(struct nfs_server *server,
817 goto error; 817 goto error;
818 } 818 }
819 819
820 if (server->nfs_client == clp) {
821 error = -ELOOP;
822 goto error;
823 }
824
820 /* 825 /*
821 * Query for the lease time on clientid setup or renewal 826 * Query for the lease time on clientid setup or renewal
822 * 827 *
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a036e93bdf96..a9dec32ba9ba 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -634,15 +634,11 @@ out_sleep:
634} 634}
635EXPORT_SYMBOL_GPL(nfs40_setup_sequence); 635EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
636 636
637static int nfs40_sequence_done(struct rpc_task *task, 637static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res)
638 struct nfs4_sequence_res *res)
639{ 638{
640 struct nfs4_slot *slot = res->sr_slot; 639 struct nfs4_slot *slot = res->sr_slot;
641 struct nfs4_slot_table *tbl; 640 struct nfs4_slot_table *tbl;
642 641
643 if (slot == NULL)
644 goto out;
645
646 tbl = slot->table; 642 tbl = slot->table;
647 spin_lock(&tbl->slot_tbl_lock); 643 spin_lock(&tbl->slot_tbl_lock);
648 if (!nfs41_wake_and_assign_slot(tbl, slot)) 644 if (!nfs41_wake_and_assign_slot(tbl, slot))
@@ -650,7 +646,13 @@ static int nfs40_sequence_done(struct rpc_task *task,
650 spin_unlock(&tbl->slot_tbl_lock); 646 spin_unlock(&tbl->slot_tbl_lock);
651 647
652 res->sr_slot = NULL; 648 res->sr_slot = NULL;
653out: 649}
650
651static int nfs40_sequence_done(struct rpc_task *task,
652 struct nfs4_sequence_res *res)
653{
654 if (res->sr_slot != NULL)
655 nfs40_sequence_free_slot(res);
654 return 1; 656 return 1;
655} 657}
656 658
@@ -666,6 +668,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
666 tbl = slot->table; 668 tbl = slot->table;
667 session = tbl->session; 669 session = tbl->session;
668 670
671 /* Bump the slot sequence number */
672 if (slot->seq_done)
673 slot->seq_nr++;
674 slot->seq_done = 0;
675
669 spin_lock(&tbl->slot_tbl_lock); 676 spin_lock(&tbl->slot_tbl_lock);
670 /* Be nice to the server: try to ensure that the last transmitted 677 /* Be nice to the server: try to ensure that the last transmitted
671 * value for highest_user_slotid <= target_highest_slotid 678 * value for highest_user_slotid <= target_highest_slotid
@@ -686,9 +693,12 @@ out_unlock:
686 res->sr_slot = NULL; 693 res->sr_slot = NULL;
687 if (send_new_highest_used_slotid) 694 if (send_new_highest_used_slotid)
688 nfs41_notify_server(session->clp); 695 nfs41_notify_server(session->clp);
696 if (waitqueue_active(&tbl->slot_waitq))
697 wake_up_all(&tbl->slot_waitq);
689} 698}
690 699
691int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 700static int nfs41_sequence_process(struct rpc_task *task,
701 struct nfs4_sequence_res *res)
692{ 702{
693 struct nfs4_session *session; 703 struct nfs4_session *session;
694 struct nfs4_slot *slot = res->sr_slot; 704 struct nfs4_slot *slot = res->sr_slot;
@@ -714,7 +724,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
714 switch (res->sr_status) { 724 switch (res->sr_status) {
715 case 0: 725 case 0:
716 /* Update the slot's sequence and clientid lease timer */ 726 /* Update the slot's sequence and clientid lease timer */
717 ++slot->seq_nr; 727 slot->seq_done = 1;
718 clp = session->clp; 728 clp = session->clp;
719 do_renew_lease(clp, res->sr_timestamp); 729 do_renew_lease(clp, res->sr_timestamp);
720 /* Check sequence flags */ 730 /* Check sequence flags */
@@ -769,16 +779,16 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
769 goto retry_nowait; 779 goto retry_nowait;
770 default: 780 default:
771 /* Just update the slot sequence no. */ 781 /* Just update the slot sequence no. */
772 ++slot->seq_nr; 782 slot->seq_done = 1;
773 } 783 }
774out: 784out:
775 /* The session may be reset by one of the error handlers. */ 785 /* The session may be reset by one of the error handlers. */
776 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); 786 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
777 nfs41_sequence_free_slot(res);
778out_noaction: 787out_noaction:
779 return ret; 788 return ret;
780retry_nowait: 789retry_nowait:
781 if (rpc_restart_call_prepare(task)) { 790 if (rpc_restart_call_prepare(task)) {
791 nfs41_sequence_free_slot(res);
782 task->tk_status = 0; 792 task->tk_status = 0;
783 ret = 0; 793 ret = 0;
784 } 794 }
@@ -789,8 +799,37 @@ out_retry:
789 rpc_delay(task, NFS4_POLL_RETRY_MAX); 799 rpc_delay(task, NFS4_POLL_RETRY_MAX);
790 return 0; 800 return 0;
791} 801}
802
803int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
804{
805 if (!nfs41_sequence_process(task, res))
806 return 0;
807 if (res->sr_slot != NULL)
808 nfs41_sequence_free_slot(res);
809 return 1;
810
811}
792EXPORT_SYMBOL_GPL(nfs41_sequence_done); 812EXPORT_SYMBOL_GPL(nfs41_sequence_done);
793 813
814static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
815{
816 if (res->sr_slot == NULL)
817 return 1;
818 if (res->sr_slot->table->session != NULL)
819 return nfs41_sequence_process(task, res);
820 return nfs40_sequence_done(task, res);
821}
822
823static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
824{
825 if (res->sr_slot != NULL) {
826 if (res->sr_slot->table->session != NULL)
827 nfs41_sequence_free_slot(res);
828 else
829 nfs40_sequence_free_slot(res);
830 }
831}
832
794int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) 833int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
795{ 834{
796 if (res->sr_slot == NULL) 835 if (res->sr_slot == NULL)
@@ -920,6 +959,17 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
920 args, res, task); 959 args, res, task);
921} 960}
922 961
962static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
963{
964 return nfs40_sequence_done(task, res);
965}
966
967static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
968{
969 if (res->sr_slot != NULL)
970 nfs40_sequence_free_slot(res);
971}
972
923int nfs4_sequence_done(struct rpc_task *task, 973int nfs4_sequence_done(struct rpc_task *task,
924 struct nfs4_sequence_res *res) 974 struct nfs4_sequence_res *res)
925{ 975{
@@ -1197,6 +1247,7 @@ static void nfs4_opendata_free(struct kref *kref)
1197 struct super_block *sb = p->dentry->d_sb; 1247 struct super_block *sb = p->dentry->d_sb;
1198 1248
1199 nfs_free_seqid(p->o_arg.seqid); 1249 nfs_free_seqid(p->o_arg.seqid);
1250 nfs4_sequence_free_slot(&p->o_res.seq_res);
1200 if (p->state != NULL) 1251 if (p->state != NULL)
1201 nfs4_put_open_state(p->state); 1252 nfs4_put_open_state(p->state);
1202 nfs4_put_state_owner(p->owner); 1253 nfs4_put_state_owner(p->owner);
@@ -1656,9 +1707,14 @@ err:
1656static struct nfs4_state * 1707static struct nfs4_state *
1657nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) 1708nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
1658{ 1709{
1710 struct nfs4_state *ret;
1711
1659 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) 1712 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
1660 return _nfs4_opendata_reclaim_to_nfs4_state(data); 1713 ret =_nfs4_opendata_reclaim_to_nfs4_state(data);
1661 return _nfs4_opendata_to_nfs4_state(data); 1714 else
1715 ret = _nfs4_opendata_to_nfs4_state(data);
1716 nfs4_sequence_free_slot(&data->o_res.seq_res);
1717 return ret;
1662} 1718}
1663 1719
1664static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) 1720static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
@@ -2056,7 +2112,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
2056 2112
2057 data->rpc_status = task->tk_status; 2113 data->rpc_status = task->tk_status;
2058 2114
2059 if (!nfs4_sequence_done(task, &data->o_res.seq_res)) 2115 if (!nfs4_sequence_process(task, &data->o_res.seq_res))
2060 return; 2116 return;
2061 2117
2062 if (task->tk_status == 0) { 2118 if (task->tk_status == 0) {
@@ -4237,12 +4293,9 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
4237 err = _nfs4_do_fsinfo(server, fhandle, fsinfo); 4293 err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
4238 trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err); 4294 trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
4239 if (err == 0) { 4295 if (err == 0) {
4240 struct nfs_client *clp = server->nfs_client; 4296 nfs4_set_lease_period(server->nfs_client,
4241 4297 fsinfo->lease_time * HZ,
4242 spin_lock(&clp->cl_lock); 4298 now);
4243 clp->cl_lease_time = fsinfo->lease_time * HZ;
4244 clp->cl_last_renewal = now;
4245 spin_unlock(&clp->cl_lock);
4246 break; 4299 break;
4247 } 4300 }
4248 err = nfs4_handle_exception(server, err, &exception); 4301 err = nfs4_handle_exception(server, err, &exception);
@@ -7517,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
7517 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 7570 status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
7518 trace_nfs4_create_session(clp, status); 7571 trace_nfs4_create_session(clp, status);
7519 7572
7573 switch (status) {
7574 case -NFS4ERR_STALE_CLIENTID:
7575 case -NFS4ERR_DELAY:
7576 case -ETIMEDOUT:
7577 case -EACCES:
7578 case -EAGAIN:
7579 goto out;
7580 };
7581
7582 clp->cl_seqid++;
7520 if (!status) { 7583 if (!status) {
7521 /* Verify the session's negotiated channel_attrs values */ 7584 /* Verify the session's negotiated channel_attrs values */
7522 status = nfs4_verify_channel_attrs(&args, &res); 7585 status = nfs4_verify_channel_attrs(&args, &res);
7523 /* Increment the clientid slot sequence id */ 7586 /* Increment the clientid slot sequence id */
7524 if (clp->cl_seqid == res.seqid)
7525 clp->cl_seqid++;
7526 if (status) 7587 if (status)
7527 goto out; 7588 goto out;
7528 nfs4_update_session(session, &res); 7589 nfs4_update_session(session, &res);
@@ -7867,7 +7928,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7867 struct nfs4_layoutget *lgp = calldata; 7928 struct nfs4_layoutget *lgp = calldata;
7868 7929
7869 dprintk("--> %s\n", __func__); 7930 dprintk("--> %s\n", __func__);
7870 nfs41_sequence_done(task, &lgp->res.seq_res); 7931 nfs41_sequence_process(task, &lgp->res.seq_res);
7871 dprintk("<-- %s\n", __func__); 7932 dprintk("<-- %s\n", __func__);
7872} 7933}
7873 7934
@@ -8083,6 +8144,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
8083 /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ 8144 /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
8084 if (status == 0 && lgp->res.layoutp->len) 8145 if (status == 0 && lgp->res.layoutp->len)
8085 lseg = pnfs_layout_process(lgp); 8146 lseg = pnfs_layout_process(lgp);
8147 nfs4_sequence_free_slot(&lgp->res.seq_res);
8086 rpc_put_task(task); 8148 rpc_put_task(task);
8087 dprintk("<-- %s status=%d\n", __func__, status); 8149 dprintk("<-- %s status=%d\n", __func__, status);
8088 if (status) 8150 if (status)
@@ -8109,7 +8171,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
8109 8171
8110 dprintk("--> %s\n", __func__); 8172 dprintk("--> %s\n", __func__);
8111 8173
8112 if (!nfs41_sequence_done(task, &lrp->res.seq_res)) 8174 if (!nfs41_sequence_process(task, &lrp->res.seq_res))
8113 return; 8175 return;
8114 8176
8115 server = NFS_SERVER(lrp->args.inode); 8177 server = NFS_SERVER(lrp->args.inode);
@@ -8121,6 +8183,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
8121 case -NFS4ERR_DELAY: 8183 case -NFS4ERR_DELAY:
8122 if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) 8184 if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
8123 break; 8185 break;
8186 nfs4_sequence_free_slot(&lrp->res.seq_res);
8124 rpc_restart_call_prepare(task); 8187 rpc_restart_call_prepare(task);
8125 return; 8188 return;
8126 } 8189 }
@@ -8135,12 +8198,16 @@ static void nfs4_layoutreturn_release(void *calldata)
8135 8198
8136 dprintk("--> %s\n", __func__); 8199 dprintk("--> %s\n", __func__);
8137 spin_lock(&lo->plh_inode->i_lock); 8200 spin_lock(&lo->plh_inode->i_lock);
8138 pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, 8201 if (lrp->res.lrs_present) {
8139 be32_to_cpu(lrp->args.stateid.seqid)); 8202 pnfs_mark_matching_lsegs_invalid(lo, &freeme,
8140 if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) 8203 &lrp->args.range,
8204 be32_to_cpu(lrp->args.stateid.seqid));
8141 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); 8205 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
8206 } else
8207 pnfs_mark_layout_stateid_invalid(lo, &freeme);
8142 pnfs_clear_layoutreturn_waitbit(lo); 8208 pnfs_clear_layoutreturn_waitbit(lo);
8143 spin_unlock(&lo->plh_inode->i_lock); 8209 spin_unlock(&lo->plh_inode->i_lock);
8210 nfs4_sequence_free_slot(&lrp->res.seq_res);
8144 pnfs_free_lseg_list(&freeme); 8211 pnfs_free_lseg_list(&freeme);
8145 pnfs_put_layout_hdr(lrp->args.layout); 8212 pnfs_put_layout_hdr(lrp->args.layout);
8146 nfs_iput_and_deactive(lrp->inode); 8213 nfs_iput_and_deactive(lrp->inode);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e1ba58c3d1ad..82e77198d17e 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -136,6 +136,26 @@ nfs4_kill_renewd(struct nfs_client *clp)
136 cancel_delayed_work_sync(&clp->cl_renewd); 136 cancel_delayed_work_sync(&clp->cl_renewd);
137} 137}
138 138
139/**
140 * nfs4_set_lease_period - Sets the lease period on a nfs_client
141 *
142 * @clp: pointer to nfs_client
143 * @lease: new value for lease period
144 * @lastrenewed: time at which lease was last renewed
145 */
146void nfs4_set_lease_period(struct nfs_client *clp,
147 unsigned long lease,
148 unsigned long lastrenewed)
149{
150 spin_lock(&clp->cl_lock);
151 clp->cl_lease_time = lease;
152 clp->cl_last_renewal = lastrenewed;
153 spin_unlock(&clp->cl_lock);
154
155 /* Cap maximum reconnect timeout at 1/2 lease period */
156 rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1);
157}
158
139/* 159/*
140 * Local variables: 160 * Local variables:
141 * c-basic-offset: 8 161 * c-basic-offset: 8
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 332d06e64fa9..b62973045a3e 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -28,6 +28,7 @@ static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
28 tbl->highest_used_slotid = NFS4_NO_SLOT; 28 tbl->highest_used_slotid = NFS4_NO_SLOT;
29 spin_lock_init(&tbl->slot_tbl_lock); 29 spin_lock_init(&tbl->slot_tbl_lock);
30 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue); 30 rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
31 init_waitqueue_head(&tbl->slot_waitq);
31 init_completion(&tbl->complete); 32 init_completion(&tbl->complete);
32} 33}
33 34
@@ -172,6 +173,58 @@ struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
172 return ERR_PTR(-E2BIG); 173 return ERR_PTR(-E2BIG);
173} 174}
174 175
176static int nfs4_slot_get_seqid(struct nfs4_slot_table *tbl, u32 slotid,
177 u32 *seq_nr)
178 __must_hold(&tbl->slot_tbl_lock)
179{
180 struct nfs4_slot *slot;
181
182 slot = nfs4_lookup_slot(tbl, slotid);
183 if (IS_ERR(slot))
184 return PTR_ERR(slot);
185 *seq_nr = slot->seq_nr;
186 return 0;
187}
188
189/*
190 * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use
191 *
192 * Given a slot table, slot id and sequence number, determine if the
193 * RPC call in question is still in flight. This function is mainly
194 * intended for use by the callback channel.
195 */
196static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl,
197 u32 slotid, u32 seq_nr)
198{
199 u32 cur_seq;
200 bool ret = false;
201
202 spin_lock(&tbl->slot_tbl_lock);
203 if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 &&
204 cur_seq == seq_nr && test_bit(slotid, tbl->used_slots))
205 ret = true;
206 spin_unlock(&tbl->slot_tbl_lock);
207 return ret;
208}
209
210/*
211 * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete
212 *
213 * Given a slot table, slot id and sequence number, wait until the
214 * corresponding RPC call completes. This function is mainly
215 * intended for use by the callback channel.
216 */
217int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
218 u32 slotid, u32 seq_nr,
219 unsigned long timeout)
220{
221 if (wait_event_timeout(tbl->slot_waitq,
222 !nfs4_slot_seqid_in_use(tbl, slotid, seq_nr),
223 timeout) == 0)
224 return -ETIMEDOUT;
225 return 0;
226}
227
175/* 228/*
176 * nfs4_alloc_slot - efficiently look for a free slot 229 * nfs4_alloc_slot - efficiently look for a free slot
177 * 230 *
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 5b51298d1d03..f703b755351b 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -21,7 +21,8 @@ struct nfs4_slot {
21 unsigned long generation; 21 unsigned long generation;
22 u32 slot_nr; 22 u32 slot_nr;
23 u32 seq_nr; 23 u32 seq_nr;
24 unsigned int interrupted : 1; 24 unsigned int interrupted : 1,
25 seq_done : 1;
25}; 26};
26 27
27/* Sessions */ 28/* Sessions */
@@ -36,6 +37,7 @@ struct nfs4_slot_table {
36 unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */ 37 unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
37 spinlock_t slot_tbl_lock; 38 spinlock_t slot_tbl_lock;
38 struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */ 39 struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
40 wait_queue_head_t slot_waitq; /* Completion wait on slot */
39 u32 max_slots; /* # slots in table */ 41 u32 max_slots; /* # slots in table */
40 u32 max_slotid; /* Max allowed slotid value */ 42 u32 max_slotid; /* Max allowed slotid value */
41 u32 highest_used_slotid; /* sent to server on each SEQ. 43 u32 highest_used_slotid; /* sent to server on each SEQ.
@@ -78,6 +80,9 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
78extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl); 80extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
79extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl); 81extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
80extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid); 82extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
83extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
84 u32 slotid, u32 seq_nr,
85 unsigned long timeout);
81extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); 86extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
82extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot); 87extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
83extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); 88extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 834b875900d6..cada00aa5096 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -277,20 +277,17 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
277{ 277{
278 int status; 278 int status;
279 struct nfs_fsinfo fsinfo; 279 struct nfs_fsinfo fsinfo;
280 unsigned long now;
280 281
281 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { 282 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
282 nfs4_schedule_state_renewal(clp); 283 nfs4_schedule_state_renewal(clp);
283 return 0; 284 return 0;
284 } 285 }
285 286
287 now = jiffies;
286 status = nfs4_proc_get_lease_time(clp, &fsinfo); 288 status = nfs4_proc_get_lease_time(clp, &fsinfo);
287 if (status == 0) { 289 if (status == 0) {
288 /* Update lease time and schedule renewal */ 290 nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now);
289 spin_lock(&clp->cl_lock);
290 clp->cl_lease_time = fsinfo.lease_time * HZ;
291 clp->cl_last_renewal = jiffies;
292 spin_unlock(&clp->cl_lock);
293
294 nfs4_schedule_state_renewal(clp); 291 nfs4_schedule_state_renewal(clp);
295 } 292 }
296 293
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70806cae0d36..2c93a85eda51 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
365 /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ 365 /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
366 atomic_dec(&lo->plh_refcount); 366 atomic_dec(&lo->plh_refcount);
367 if (list_empty(&lo->plh_segs)) { 367 if (list_empty(&lo->plh_segs)) {
368 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 368 if (atomic_read(&lo->plh_outstanding) == 0)
369 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
369 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 370 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
370 } 371 }
371 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 372 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
@@ -768,17 +769,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
768 pnfs_destroy_layouts_byclid(clp, false); 769 pnfs_destroy_layouts_byclid(clp, false);
769} 770}
770 771
772static void
773pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
774{
775 lo->plh_return_iomode = 0;
776 lo->plh_return_seq = 0;
777 clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
778}
779
771/* update lo->plh_stateid with new if is more recent */ 780/* update lo->plh_stateid with new if is more recent */
772void 781void
773pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 782pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
774 bool update_barrier) 783 bool update_barrier)
775{ 784{
776 u32 oldseq, newseq, new_barrier = 0; 785 u32 oldseq, newseq, new_barrier = 0;
777 bool invalid = !pnfs_layout_is_valid(lo);
778 786
779 oldseq = be32_to_cpu(lo->plh_stateid.seqid); 787 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
780 newseq = be32_to_cpu(new->seqid); 788 newseq = be32_to_cpu(new->seqid);
781 if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { 789
790 if (!pnfs_layout_is_valid(lo)) {
791 nfs4_stateid_copy(&lo->plh_stateid, new);
792 lo->plh_barrier = newseq;
793 pnfs_clear_layoutreturn_info(lo);
794 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
795 return;
796 }
797 if (pnfs_seqid_is_newer(newseq, oldseq)) {
782 nfs4_stateid_copy(&lo->plh_stateid, new); 798 nfs4_stateid_copy(&lo->plh_stateid, new);
783 /* 799 /*
784 * Because of wraparound, we want to keep the barrier 800 * Because of wraparound, we want to keep the barrier
@@ -790,7 +806,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
790 new_barrier = be32_to_cpu(new->seqid); 806 new_barrier = be32_to_cpu(new->seqid);
791 else if (new_barrier == 0) 807 else if (new_barrier == 0)
792 return; 808 return;
793 if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) 809 if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
794 lo->plh_barrier = new_barrier; 810 lo->plh_barrier = new_barrier;
795} 811}
796 812
@@ -886,19 +902,14 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
886 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); 902 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
887} 903}
888 904
889static void
890pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
891{
892 lo->plh_return_iomode = 0;
893 lo->plh_return_seq = 0;
894 clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
895}
896
897static bool 905static bool
898pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, 906pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
899 nfs4_stateid *stateid, 907 nfs4_stateid *stateid,
900 enum pnfs_iomode *iomode) 908 enum pnfs_iomode *iomode)
901{ 909{
910 /* Serialise LAYOUTGET/LAYOUTRETURN */
911 if (atomic_read(&lo->plh_outstanding) != 0)
912 return false;
902 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) 913 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
903 return false; 914 return false;
904 pnfs_get_layout_hdr(lo); 915 pnfs_get_layout_hdr(lo);
@@ -1555,6 +1566,7 @@ pnfs_update_layout(struct inode *ino,
1555 } 1566 }
1556 1567
1557lookup_again: 1568lookup_again:
1569 nfs4_client_recover_expired_lease(clp);
1558 first = false; 1570 first = false;
1559 spin_lock(&ino->i_lock); 1571 spin_lock(&ino->i_lock);
1560 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1572 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
@@ -1797,16 +1809,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1797 */ 1809 */
1798 pnfs_mark_layout_stateid_invalid(lo, &free_me); 1810 pnfs_mark_layout_stateid_invalid(lo, &free_me);
1799 1811
1800 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); 1812 pnfs_set_layout_stateid(lo, &res->stateid, true);
1801 lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1802 } 1813 }
1803 1814
1804 pnfs_get_lseg(lseg); 1815 pnfs_get_lseg(lseg);
1805 pnfs_layout_insert_lseg(lo, lseg, &free_me); 1816 pnfs_layout_insert_lseg(lo, lseg, &free_me);
1806 if (!pnfs_layout_is_valid(lo)) {
1807 pnfs_clear_layoutreturn_info(lo);
1808 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1809 }
1810 1817
1811 1818
1812 if (res->return_on_close) 1819 if (res->return_on_close)
@@ -2510,7 +2517,6 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
2510 2517
2511 data->args.fh = NFS_FH(inode); 2518 data->args.fh = NFS_FH(inode);
2512 data->args.inode = inode; 2519 data->args.inode = inode;
2513 nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
2514 status = ld->prepare_layoutstats(&data->args); 2520 status = ld->prepare_layoutstats(&data->args);
2515 if (status) 2521 if (status)
2516 goto out_free; 2522 goto out_free;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 18d446e1a82b..d39601381adf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -923,6 +923,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
923 923
924 data = kzalloc(sizeof(*data), GFP_KERNEL); 924 data = kzalloc(sizeof(*data), GFP_KERNEL);
925 if (data) { 925 if (data) {
926 data->timeo = NFS_UNSPEC_TIMEO;
927 data->retrans = NFS_UNSPEC_RETRANS;
926 data->acregmin = NFS_DEF_ACREGMIN; 928 data->acregmin = NFS_DEF_ACREGMIN;
927 data->acregmax = NFS_DEF_ACREGMAX; 929 data->acregmax = NFS_DEF_ACREGMAX;
928 data->acdirmin = NFS_DEF_ACDIRMIN; 930 data->acdirmin = NFS_DEF_ACDIRMIN;
@@ -1189,6 +1191,19 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option)
1189 return rc; 1191 return rc;
1190} 1192}
1191 1193
1194static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option,
1195 unsigned long l_bound, unsigned long u_bound)
1196{
1197 int ret;
1198
1199 ret = nfs_get_option_ul(args, option);
1200 if (ret != 0)
1201 return ret;
1202 if (*option < l_bound || *option > u_bound)
1203 return -ERANGE;
1204 return 0;
1205}
1206
1192/* 1207/*
1193 * Error-check and convert a string of mount options from user space into 1208 * Error-check and convert a string of mount options from user space into
1194 * a data structure. The whole mount string is processed; bad options are 1209 * a data structure. The whole mount string is processed; bad options are
@@ -1352,12 +1367,12 @@ static int nfs_parse_mount_options(char *raw,
1352 mnt->bsize = option; 1367 mnt->bsize = option;
1353 break; 1368 break;
1354 case Opt_timeo: 1369 case Opt_timeo:
1355 if (nfs_get_option_ul(args, &option) || option == 0) 1370 if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX))
1356 goto out_invalid_value; 1371 goto out_invalid_value;
1357 mnt->timeo = option; 1372 mnt->timeo = option;
1358 break; 1373 break;
1359 case Opt_retrans: 1374 case Opt_retrans:
1360 if (nfs_get_option_ul(args, &option) || option == 0) 1375 if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX))
1361 goto out_invalid_value; 1376 goto out_invalid_value;
1362 mnt->retrans = option; 1377 mnt->retrans = option;
1363 break; 1378 break;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8410ca275db1..a204d7e109d4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4903,6 +4903,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4903 return nfs_ok; 4903 return nfs_ok;
4904} 4904}
4905 4905
4906static __be32
4907nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
4908{
4909 struct nfs4_ol_stateid *stp = openlockstateid(s);
4910 __be32 ret;
4911
4912 mutex_lock(&stp->st_mutex);
4913
4914 ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
4915 if (ret)
4916 goto out;
4917
4918 ret = nfserr_locks_held;
4919 if (check_for_locks(stp->st_stid.sc_file,
4920 lockowner(stp->st_stateowner)))
4921 goto out;
4922
4923 release_lock_stateid(stp);
4924 ret = nfs_ok;
4925
4926out:
4927 mutex_unlock(&stp->st_mutex);
4928 nfs4_put_stid(s);
4929 return ret;
4930}
4931
4906__be32 4932__be32
4907nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 4933nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4908 struct nfsd4_free_stateid *free_stateid) 4934 struct nfsd4_free_stateid *free_stateid)
@@ -4910,7 +4936,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4910 stateid_t *stateid = &free_stateid->fr_stateid; 4936 stateid_t *stateid = &free_stateid->fr_stateid;
4911 struct nfs4_stid *s; 4937 struct nfs4_stid *s;
4912 struct nfs4_delegation *dp; 4938 struct nfs4_delegation *dp;
4913 struct nfs4_ol_stateid *stp;
4914 struct nfs4_client *cl = cstate->session->se_client; 4939 struct nfs4_client *cl = cstate->session->se_client;
4915 __be32 ret = nfserr_bad_stateid; 4940 __be32 ret = nfserr_bad_stateid;
4916 4941
@@ -4929,18 +4954,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4929 ret = nfserr_locks_held; 4954 ret = nfserr_locks_held;
4930 break; 4955 break;
4931 case NFS4_LOCK_STID: 4956 case NFS4_LOCK_STID:
4932 ret = check_stateid_generation(stateid, &s->sc_stateid, 1); 4957 atomic_inc(&s->sc_count);
4933 if (ret)
4934 break;
4935 stp = openlockstateid(s);
4936 ret = nfserr_locks_held;
4937 if (check_for_locks(stp->st_stid.sc_file,
4938 lockowner(stp->st_stateowner)))
4939 break;
4940 WARN_ON(!unhash_lock_stateid(stp));
4941 spin_unlock(&cl->cl_lock); 4958 spin_unlock(&cl->cl_lock);
4942 nfs4_put_stid(s); 4959 ret = nfsd4_free_lock_stateid(stateid, s);
4943 ret = nfs_ok;
4944 goto out; 4960 goto out;
4945 case NFS4_REVOKED_DELEG_STID: 4961 case NFS4_REVOKED_DELEG_STID:
4946 dp = delegstateid(s); 4962 dp = delegstateid(s);
@@ -5507,7 +5523,7 @@ static __be32
5507lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, 5523lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
5508 struct nfs4_ol_stateid *ost, 5524 struct nfs4_ol_stateid *ost,
5509 struct nfsd4_lock *lock, 5525 struct nfsd4_lock *lock,
5510 struct nfs4_ol_stateid **lst, bool *new) 5526 struct nfs4_ol_stateid **plst, bool *new)
5511{ 5527{
5512 __be32 status; 5528 __be32 status;
5513 struct nfs4_file *fi = ost->st_stid.sc_file; 5529 struct nfs4_file *fi = ost->st_stid.sc_file;
@@ -5515,7 +5531,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
5515 struct nfs4_client *cl = oo->oo_owner.so_client; 5531 struct nfs4_client *cl = oo->oo_owner.so_client;
5516 struct inode *inode = d_inode(cstate->current_fh.fh_dentry); 5532 struct inode *inode = d_inode(cstate->current_fh.fh_dentry);
5517 struct nfs4_lockowner *lo; 5533 struct nfs4_lockowner *lo;
5534 struct nfs4_ol_stateid *lst;
5518 unsigned int strhashval; 5535 unsigned int strhashval;
5536 bool hashed;
5519 5537
5520 lo = find_lockowner_str(cl, &lock->lk_new_owner); 5538 lo = find_lockowner_str(cl, &lock->lk_new_owner);
5521 if (!lo) { 5539 if (!lo) {
@@ -5531,12 +5549,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
5531 goto out; 5549 goto out;
5532 } 5550 }
5533 5551
5534 *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); 5552retry:
5535 if (*lst == NULL) { 5553 lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
5554 if (lst == NULL) {
5536 status = nfserr_jukebox; 5555 status = nfserr_jukebox;
5537 goto out; 5556 goto out;
5538 } 5557 }
5558
5559 mutex_lock(&lst->st_mutex);
5560
5561 /* See if it's still hashed to avoid race with FREE_STATEID */
5562 spin_lock(&cl->cl_lock);
5563 hashed = !list_empty(&lst->st_perfile);
5564 spin_unlock(&cl->cl_lock);
5565
5566 if (!hashed) {
5567 mutex_unlock(&lst->st_mutex);
5568 nfs4_put_stid(&lst->st_stid);
5569 goto retry;
5570 }
5539 status = nfs_ok; 5571 status = nfs_ok;
5572 *plst = lst;
5540out: 5573out:
5541 nfs4_put_stateowner(&lo->lo_owner); 5574 nfs4_put_stateowner(&lo->lo_owner);
5542 return status; 5575 return status;
@@ -5603,8 +5636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
5603 goto out; 5636 goto out;
5604 status = lookup_or_create_lock_state(cstate, open_stp, lock, 5637 status = lookup_or_create_lock_state(cstate, open_stp, lock,
5605 &lock_stp, &new); 5638 &lock_stp, &new);
5606 if (status == nfs_ok)
5607 mutex_lock(&lock_stp->st_mutex);
5608 } else { 5639 } else {
5609 status = nfs4_preprocess_seqid_op(cstate, 5640 status = nfs4_preprocess_seqid_op(cstate,
5610 lock->lk_old_lock_seqid, 5641 lock->lk_old_lock_seqid,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ba944123167b..ff476e654b8f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1252,10 +1252,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1252 if (IS_ERR(dchild)) 1252 if (IS_ERR(dchild))
1253 return nfserrno(host_err); 1253 return nfserrno(host_err);
1254 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); 1254 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
1255 if (err) { 1255 /*
1256 dput(dchild); 1256 * We unconditionally drop our ref to dchild as fh_compose will have
1257 * already grabbed its own ref for it.
1258 */
1259 dput(dchild);
1260 if (err)
1257 return err; 1261 return err;
1258 }
1259 return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type, 1262 return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
1260 rdev, resfhp); 1263 rdev, resfhp);
1261} 1264}
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d2f97ecca6a5..e0e5f7c3c99f 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
67 67
68 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 68 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
69 69
70 wait_event(group->fanotify_data.access_waitq, event->response || 70 wait_event(group->fanotify_data.access_waitq, event->response);
71 atomic_read(&group->fanotify_data.bypass_perm));
72
73 if (!event->response) { /* bypass_perm set */
74 /*
75 * Event was canceled because group is being destroyed. Remove
76 * it from group's event list because we are responsible for
77 * freeing the permission event.
78 */
79 fsnotify_remove_event(group, &event->fae.fse);
80 return 0;
81 }
82 71
83 /* userspace responded, convert to something usable */ 72 /* userspace responded, convert to something usable */
84 switch (event->response) { 73 switch (event->response) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8e8e6bcd1d43..a64313868d3a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file)
358 358
359#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 359#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
360 struct fanotify_perm_event_info *event, *next; 360 struct fanotify_perm_event_info *event, *next;
361 struct fsnotify_event *fsn_event;
361 362
362 /* 363 /*
363 * There may be still new events arriving in the notification queue 364 * Stop new events from arriving in the notification queue. Since
364 * but since userspace cannot use fanotify fd anymore, no event can 365 * userspace cannot use fanotify fd anymore, no event can enter or
365 * enter or leave access_list by now. 366 * leave access_list by now either.
366 */ 367 */
367 spin_lock(&group->fanotify_data.access_lock); 368 fsnotify_group_stop_queueing(group);
368
369 atomic_inc(&group->fanotify_data.bypass_perm);
370 369
370 /*
371 * Process all permission events on access_list and notification queue
372 * and simulate reply from userspace.
373 */
374 spin_lock(&group->fanotify_data.access_lock);
371 list_for_each_entry_safe(event, next, &group->fanotify_data.access_list, 375 list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
372 fae.fse.list) { 376 fae.fse.list) {
373 pr_debug("%s: found group=%p event=%p\n", __func__, group, 377 pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
379 spin_unlock(&group->fanotify_data.access_lock); 383 spin_unlock(&group->fanotify_data.access_lock);
380 384
381 /* 385 /*
382 * Since bypass_perm is set, newly queued events will not wait for 386 * Destroy all non-permission events. For permission events just
383 * access response. Wake up the already sleeping ones now. 387 * dequeue them and set the response. They will be freed once the
384 * synchronize_srcu() in fsnotify_destroy_group() will wait for all 388 * response is consumed and fanotify_get_response() returns.
385 * processes sleeping in fanotify_handle_event() waiting for access
386 * response and thus also for all permission events to be freed.
387 */ 389 */
390 mutex_lock(&group->notification_mutex);
391 while (!fsnotify_notify_queue_is_empty(group)) {
392 fsn_event = fsnotify_remove_first_event(group);
393 if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
394 fsnotify_destroy_event(group, fsn_event);
395 else
396 FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
397 }
398 mutex_unlock(&group->notification_mutex);
399
400 /* Response for all permission events is set, wake up waiters */
388 wake_up(&group->fanotify_data.access_waitq); 401 wake_up(&group->fanotify_data.access_waitq);
389#endif 402#endif
390 403
@@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
755 spin_lock_init(&group->fanotify_data.access_lock); 768 spin_lock_init(&group->fanotify_data.access_lock);
756 init_waitqueue_head(&group->fanotify_data.access_waitq); 769 init_waitqueue_head(&group->fanotify_data.access_waitq);
757 INIT_LIST_HEAD(&group->fanotify_data.access_list); 770 INIT_LIST_HEAD(&group->fanotify_data.access_list);
758 atomic_set(&group->fanotify_data.bypass_perm, 0);
759#endif 771#endif
760 switch (flags & FAN_ALL_CLASS_BITS) { 772 switch (flags & FAN_ALL_CLASS_BITS) {
761 case FAN_CLASS_NOTIF: 773 case FAN_CLASS_NOTIF:
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 3e2dd85be5dd..b47f7cfdcaa4 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
40} 40}
41 41
42/* 42/*
43 * Stop queueing new events for this group. Once this function returns
44 * fsnotify_add_event() will not add any new events to the group's queue.
45 */
46void fsnotify_group_stop_queueing(struct fsnotify_group *group)
47{
48 mutex_lock(&group->notification_mutex);
49 group->shutdown = true;
50 mutex_unlock(&group->notification_mutex);
51}
52
53/*
43 * Trying to get rid of a group. Remove all marks, flush all events and release 54 * Trying to get rid of a group. Remove all marks, flush all events and release
44 * the group reference. 55 * the group reference.
45 * Note that another thread calling fsnotify_clear_marks_by_group() may still 56 * Note that another thread calling fsnotify_clear_marks_by_group() may still
@@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
47 */ 58 */
48void fsnotify_destroy_group(struct fsnotify_group *group) 59void fsnotify_destroy_group(struct fsnotify_group *group)
49{ 60{
61 /*
62 * Stop queueing new events. The code below is careful enough to not
63 * require this but fanotify needs to stop queuing events even before
64 * fsnotify_destroy_group() is called and this makes the other callers
65 * of fsnotify_destroy_group() see the same behavior.
66 */
67 fsnotify_group_stop_queueing(group);
68
50 /* clear all inode marks for this group, attach them to destroy_list */ 69 /* clear all inode marks for this group, attach them to destroy_list */
51 fsnotify_detach_group_marks(group); 70 fsnotify_detach_group_marks(group);
52 71
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index a95d8e037aeb..e455e83ceeeb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
82 * Add an event to the group notification queue. The group can later pull this 82 * Add an event to the group notification queue. The group can later pull this
83 * event off the queue to deal with. The function returns 0 if the event was 83 * event off the queue to deal with. The function returns 0 if the event was
84 * added to the queue, 1 if the event was merged with some other queued event, 84 * added to the queue, 1 if the event was merged with some other queued event,
85 * 2 if the queue of events has overflown. 85 * 2 if the event was not queued - either the queue of events has overflown
86 * or the group is shutting down.
86 */ 87 */
87int fsnotify_add_event(struct fsnotify_group *group, 88int fsnotify_add_event(struct fsnotify_group *group,
88 struct fsnotify_event *event, 89 struct fsnotify_event *event,
@@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group,
96 97
97 mutex_lock(&group->notification_mutex); 98 mutex_lock(&group->notification_mutex);
98 99
100 if (group->shutdown) {
101 mutex_unlock(&group->notification_mutex);
102 return 2;
103 }
104
99 if (group->q_len >= group->max_events) { 105 if (group->q_len >= group->max_events) {
100 ret = 2; 106 ret = 2;
101 /* Queue overflow event only if it isn't already queued */ 107 /* Queue overflow event only if it isn't already queued */
@@ -126,21 +132,6 @@ queue:
126} 132}
127 133
128/* 134/*
129 * Remove @event from group's notification queue. It is the responsibility of
130 * the caller to destroy the event.
131 */
132void fsnotify_remove_event(struct fsnotify_group *group,
133 struct fsnotify_event *event)
134{
135 mutex_lock(&group->notification_mutex);
136 if (!list_empty(&event->list)) {
137 list_del_init(&event->list);
138 group->q_len--;
139 }
140 mutex_unlock(&group->notification_mutex);
141}
142
143/*
144 * Remove and return the first event from the notification list. It is the 135 * Remove and return the first event from the notification list. It is the
145 * responsibility of the caller to destroy the obtained event 136 * responsibility of the caller to destroy the obtained event
146 */ 137 */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7dabbc31060e..f165f867f332 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5922,7 +5922,6 @@ bail:
5922} 5922}
5923 5923
5924static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, 5924static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5925 handle_t *handle,
5926 struct inode *data_alloc_inode, 5925 struct inode *data_alloc_inode,
5927 struct buffer_head *data_alloc_bh) 5926 struct buffer_head *data_alloc_bh)
5928{ 5927{
@@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5935 struct ocfs2_truncate_log *tl; 5934 struct ocfs2_truncate_log *tl;
5936 struct inode *tl_inode = osb->osb_tl_inode; 5935 struct inode *tl_inode = osb->osb_tl_inode;
5937 struct buffer_head *tl_bh = osb->osb_tl_bh; 5936 struct buffer_head *tl_bh = osb->osb_tl_bh;
5937 handle_t *handle;
5938 5938
5939 di = (struct ocfs2_dinode *) tl_bh->b_data; 5939 di = (struct ocfs2_dinode *) tl_bh->b_data;
5940 tl = &di->id2.i_dealloc; 5940 tl = &di->id2.i_dealloc;
5941 i = le16_to_cpu(tl->tl_used) - 1; 5941 i = le16_to_cpu(tl->tl_used) - 1;
5942 while (i >= 0) { 5942 while (i >= 0) {
5943 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5944 if (IS_ERR(handle)) {
5945 status = PTR_ERR(handle);
5946 mlog_errno(status);
5947 goto bail;
5948 }
5949
5943 /* Caller has given us at least enough credits to 5950 /* Caller has given us at least enough credits to
5944 * update the truncate log dinode */ 5951 * update the truncate log dinode */
5945 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, 5952 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
@@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5974 } 5981 }
5975 } 5982 }
5976 5983
5977 status = ocfs2_extend_trans(handle, 5984 ocfs2_commit_trans(osb, handle);
5978 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5979 if (status < 0) {
5980 mlog_errno(status);
5981 goto bail;
5982 }
5983 i--; 5985 i--;
5984 } 5986 }
5985 5987
@@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5994{ 5996{
5995 int status; 5997 int status;
5996 unsigned int num_to_flush; 5998 unsigned int num_to_flush;
5997 handle_t *handle;
5998 struct inode *tl_inode = osb->osb_tl_inode; 5999 struct inode *tl_inode = osb->osb_tl_inode;
5999 struct inode *data_alloc_inode = NULL; 6000 struct inode *data_alloc_inode = NULL;
6000 struct buffer_head *tl_bh = osb->osb_tl_bh; 6001 struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6038 goto out_mutex; 6039 goto out_mutex;
6039 } 6040 }
6040 6041
6041 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 6042 status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
6042 if (IS_ERR(handle)) {
6043 status = PTR_ERR(handle);
6044 mlog_errno(status);
6045 goto out_unlock;
6046 }
6047
6048 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
6049 data_alloc_bh); 6043 data_alloc_bh);
6050 if (status < 0) 6044 if (status < 0)
6051 mlog_errno(status); 6045 mlog_errno(status);
6052 6046
6053 ocfs2_commit_trans(osb, handle);
6054
6055out_unlock:
6056 brelse(data_alloc_bh); 6047 brelse(data_alloc_bh);
6057 ocfs2_inode_unlock(data_alloc_inode, 1); 6048 ocfs2_inode_unlock(data_alloc_inode, 1);
6058 6049
@@ -6413,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6413 goto out_mutex; 6404 goto out_mutex;
6414 } 6405 }
6415 6406
6416 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6417 if (IS_ERR(handle)) {
6418 ret = PTR_ERR(handle);
6419 mlog_errno(ret);
6420 goto out_unlock;
6421 }
6422
6423 while (head) { 6407 while (head) {
6424 if (head->free_bg) 6408 if (head->free_bg)
6425 bg_blkno = head->free_bg; 6409 bg_blkno = head->free_bg;
6426 else 6410 else
6427 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6411 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6428 head->free_bit); 6412 head->free_bit);
6413 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6414 if (IS_ERR(handle)) {
6415 ret = PTR_ERR(handle);
6416 mlog_errno(ret);
6417 goto out_unlock;
6418 }
6419
6429 trace_ocfs2_free_cached_blocks( 6420 trace_ocfs2_free_cached_blocks(
6430 (unsigned long long)head->free_blk, head->free_bit); 6421 (unsigned long long)head->free_blk, head->free_bit);
6431 6422
6432 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 6423 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6433 head->free_bit, bg_blkno, 1); 6424 head->free_bit, bg_blkno, 1);
6434 if (ret) { 6425 if (ret)
6435 mlog_errno(ret); 6426 mlog_errno(ret);
6436 goto out_journal;
6437 }
6438 6427
6439 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); 6428 ocfs2_commit_trans(osb, handle);
6440 if (ret) {
6441 mlog_errno(ret);
6442 goto out_journal;
6443 }
6444 6429
6445 tmp = head; 6430 tmp = head;
6446 head = head->free_next; 6431 head = head->free_next;
6447 kfree(tmp); 6432 kfree(tmp);
6448 } 6433 }
6449 6434
6450out_journal:
6451 ocfs2_commit_trans(osb, handle);
6452
6453out_unlock: 6435out_unlock:
6454 ocfs2_inode_unlock(inode, 1); 6436 ocfs2_inode_unlock(inode, 1);
6455 brelse(di_bh); 6437 brelse(di_bh);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 94b18369b1cc..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,9 +44,6 @@
44 * version here in tcp_internal.h should not need to be bumped for 44 * version here in tcp_internal.h should not need to be bumped for
45 * filesystem locking changes. 45 * filesystem locking changes.
46 * 46 *
47 * New in version 12
48 * - Negotiate hb timeout when storage is down.
49 *
50 * New in version 11 47 * New in version 11
51 * - Negotiation of filesystem locking in the dlm join. 48 * - Negotiation of filesystem locking in the dlm join.
52 * 49 *
@@ -78,7 +75,7 @@
78 * - full 64 bit i_size in the metadata lock lvbs 75 * - full 64 bit i_size in the metadata lock lvbs
79 * - introduction of "rw" lock and pushing meta/data locking down 76 * - introduction of "rw" lock and pushing meta/data locking down
80 */ 77 */
81#define O2NET_PROTOCOL_VERSION 12ULL 78#define O2NET_PROTOCOL_VERSION 11ULL
82struct o2net_handshake { 79struct o2net_handshake {
83 __be64 protocol_version; 80 __be64 protocol_version;
84 __be64 connector_id; 81 __be64 connector_id;
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index cdeafb4e7ed6..0bb128659d4b 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
268 struct dlm_lock *lock, int flags, int type) 268 struct dlm_lock *lock, int flags, int type)
269{ 269{
270 enum dlm_status status; 270 enum dlm_status status;
271 u8 old_owner = res->owner;
272 271
273 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, 272 mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
274 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); 273 lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
335 334
336 spin_lock(&res->spinlock); 335 spin_lock(&res->spinlock);
337 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 336 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
338 lock->convert_pending = 0;
339 /* if it failed, move it back to granted queue. 337 /* if it failed, move it back to granted queue.
340 * if master returns DLM_NORMAL and then down before sending ast, 338 * if master returns DLM_NORMAL and then down before sending ast,
341 * it may have already been moved to granted queue, reset to 339 * it may have already been moved to granted queue, reset to
@@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
344 if (status != DLM_NOTQUEUED) 342 if (status != DLM_NOTQUEUED)
345 dlm_error(status); 343 dlm_error(status);
346 dlm_revert_pending_convert(res, lock); 344 dlm_revert_pending_convert(res, lock);
347 } else if ((res->state & DLM_LOCK_RES_RECOVERING) || 345 } else if (!lock->convert_pending) {
348 (old_owner != res->owner)) { 346 mlog(0, "%s: res %.*s, owner died and lock has been moved back "
349 mlog(0, "res %.*s is in recovering or has been recovered.\n", 347 "to granted list, retry convert.\n",
350 res->lockname.len, res->lockname.name); 348 dlm->name, res->lockname.len, res->lockname.name);
351 status = DLM_RECOVERING; 349 status = DLM_RECOVERING;
352 } 350 }
351
352 lock->convert_pending = 0;
353bail: 353bail:
354 spin_unlock(&res->spinlock); 354 spin_unlock(&res->spinlock);
355 355
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4e7b0dc22450..0b055bfb8e86 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1506 u64 start, u64 len) 1506 u64 start, u64 len)
1507{ 1507{
1508 int ret = 0; 1508 int ret = 0;
1509 u64 tmpend, end = start + len; 1509 u64 tmpend = 0;
1510 u64 end = start + len;
1510 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1511 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1511 unsigned int csize = osb->s_clustersize; 1512 unsigned int csize = osb->s_clustersize;
1512 handle_t *handle; 1513 handle_t *handle;
@@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1538 } 1539 }
1539 1540
1540 /* 1541 /*
1541 * We want to get the byte offset of the end of the 1st cluster. 1542 * If start is on a cluster boundary and end is somewhere in another
1543 * cluster, we have not COWed the cluster starting at start, unless
1544 * end is also within the same cluster. So, in this case, we skip this
1546 * first call to ocfs2_zero_range_for_truncate() and move on
1546 * to the next one.
1542 */ 1547 */
1543 tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); 1548 if ((start & (csize - 1)) != 0) {
1544 if (tmpend > end) 1549 /*
1545 tmpend = end; 1550 * We want to get the byte offset of the end of the 1st
1551 * cluster.
1552 */
1553 tmpend = (u64)osb->s_clustersize +
1554 (start & ~(osb->s_clustersize - 1));
1555 if (tmpend > end)
1556 tmpend = end;
1546 1557
1547 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start, 1558 trace_ocfs2_zero_partial_clusters_range1(
1548 (unsigned long long)tmpend); 1559 (unsigned long long)start,
1560 (unsigned long long)tmpend);
1549 1561
1550 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1562 ret = ocfs2_zero_range_for_truncate(inode, handle, start,
1551 if (ret) 1563 tmpend);
1552 mlog_errno(ret); 1564 if (ret)
1565 mlog_errno(ret);
1566 }
1553 1567
1554 if (tmpend < end) { 1568 if (tmpend < end) {
1555 /* 1569 /*
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index ea47120a85ff..6ad3533940ba 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1199,14 +1199,24 @@ retry:
1199 inode_unlock((*ac)->ac_inode); 1199 inode_unlock((*ac)->ac_inode);
1200 1200
1201 ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted); 1201 ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
1202 if (ret == 1) 1202 if (ret == 1) {
1203 iput((*ac)->ac_inode);
1204 (*ac)->ac_inode = NULL;
1203 goto retry; 1205 goto retry;
1206 }
1204 1207
1205 if (ret < 0) 1208 if (ret < 0)
1206 mlog_errno(ret); 1209 mlog_errno(ret);
1207 1210
1208 inode_lock((*ac)->ac_inode); 1211 inode_lock((*ac)->ac_inode);
1209 ocfs2_inode_lock((*ac)->ac_inode, NULL, 1); 1212 ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
1213 if (ret < 0) {
1214 mlog_errno(ret);
1215 inode_unlock((*ac)->ac_inode);
1216 iput((*ac)->ac_inode);
1217 (*ac)->ac_inode = NULL;
1218 goto bail;
1219 }
1210 } 1220 }
1211 if (status < 0) { 1221 if (status < 0) {
1212 if (status != -ENOSPC) 1222 if (status != -ENOSPC)
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 54e5d6681786..43fdc2765aea 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -80,6 +80,8 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
80 } 80 }
81 81
82 for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { 82 for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
83 if (ovl_is_private_xattr(name))
84 continue;
83retry: 85retry:
84 size = vfs_getxattr(old, name, value, value_size); 86 size = vfs_getxattr(old, name, value, value_size);
85 if (size == -ERANGE) 87 if (size == -ERANGE)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 12bcd07b9e32..1560fdc09a5f 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -12,6 +12,8 @@
12#include <linux/xattr.h> 12#include <linux/xattr.h>
13#include <linux/security.h> 13#include <linux/security.h>
14#include <linux/cred.h> 14#include <linux/cred.h>
15#include <linux/posix_acl.h>
16#include <linux/posix_acl_xattr.h>
15#include "overlayfs.h" 17#include "overlayfs.h"
16 18
17void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) 19void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
@@ -186,6 +188,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
186 struct dentry *newdentry; 188 struct dentry *newdentry;
187 int err; 189 int err;
188 190
191 if (!hardlink && !IS_POSIXACL(udir))
192 stat->mode &= ~current_umask();
193
189 inode_lock_nested(udir, I_MUTEX_PARENT); 194 inode_lock_nested(udir, I_MUTEX_PARENT);
190 newdentry = lookup_one_len(dentry->d_name.name, upperdir, 195 newdentry = lookup_one_len(dentry->d_name.name, upperdir,
191 dentry->d_name.len); 196 dentry->d_name.len);
@@ -335,6 +340,32 @@ out_free:
335 return ret; 340 return ret;
336} 341}
337 342
343static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
344 const struct posix_acl *acl)
345{
346 void *buffer;
347 size_t size;
348 int err;
349
350 if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
351 return 0;
352
353 size = posix_acl_to_xattr(NULL, acl, NULL, 0);
354 buffer = kmalloc(size, GFP_KERNEL);
355 if (!buffer)
356 return -ENOMEM;
357
358 size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
359 err = size;
360 if (err < 0)
361 goto out_free;
362
363 err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE);
364out_free:
365 kfree(buffer);
366 return err;
367}
368
338static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, 369static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
339 struct kstat *stat, const char *link, 370 struct kstat *stat, const char *link,
340 struct dentry *hardlink) 371 struct dentry *hardlink)
@@ -346,10 +377,18 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
346 struct dentry *upper; 377 struct dentry *upper;
347 struct dentry *newdentry; 378 struct dentry *newdentry;
348 int err; 379 int err;
380 struct posix_acl *acl, *default_acl;
349 381
350 if (WARN_ON(!workdir)) 382 if (WARN_ON(!workdir))
351 return -EROFS; 383 return -EROFS;
352 384
385 if (!hardlink) {
386 err = posix_acl_create(dentry->d_parent->d_inode,
387 &stat->mode, &default_acl, &acl);
388 if (err)
389 return err;
390 }
391
353 err = ovl_lock_rename_workdir(workdir, upperdir); 392 err = ovl_lock_rename_workdir(workdir, upperdir);
354 if (err) 393 if (err)
355 goto out; 394 goto out;
@@ -384,6 +423,17 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
384 if (err) 423 if (err)
385 goto out_cleanup; 424 goto out_cleanup;
386 } 425 }
426 if (!hardlink) {
427 err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS,
428 acl);
429 if (err)
430 goto out_cleanup;
431
432 err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT,
433 default_acl);
434 if (err)
435 goto out_cleanup;
436 }
387 437
388 if (!hardlink && S_ISDIR(stat->mode)) { 438 if (!hardlink && S_ISDIR(stat->mode)) {
389 err = ovl_set_opaque(newdentry); 439 err = ovl_set_opaque(newdentry);
@@ -410,6 +460,10 @@ out_dput:
410out_unlock: 460out_unlock:
411 unlock_rename(workdir, upperdir); 461 unlock_rename(workdir, upperdir);
412out: 462out:
463 if (!hardlink) {
464 posix_acl_release(acl);
465 posix_acl_release(default_acl);
466 }
413 return err; 467 return err;
414 468
415out_cleanup: 469out_cleanup:
@@ -950,9 +1004,9 @@ const struct inode_operations ovl_dir_inode_operations = {
950 .permission = ovl_permission, 1004 .permission = ovl_permission,
951 .getattr = ovl_dir_getattr, 1005 .getattr = ovl_dir_getattr,
952 .setxattr = generic_setxattr, 1006 .setxattr = generic_setxattr,
953 .getxattr = ovl_getxattr, 1007 .getxattr = generic_getxattr,
954 .listxattr = ovl_listxattr, 1008 .listxattr = ovl_listxattr,
955 .removexattr = ovl_removexattr, 1009 .removexattr = generic_removexattr,
956 .get_acl = ovl_get_acl, 1010 .get_acl = ovl_get_acl,
957 .update_time = ovl_update_time, 1011 .update_time = ovl_update_time,
958}; 1012};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 1b885c156028..c75625c1efa3 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -10,6 +10,7 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/xattr.h> 12#include <linux/xattr.h>
13#include <linux/posix_acl.h>
13#include "overlayfs.h" 14#include "overlayfs.h"
14 15
15static int ovl_copy_up_truncate(struct dentry *dentry) 16static int ovl_copy_up_truncate(struct dentry *dentry)
@@ -191,32 +192,44 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
191 return err; 192 return err;
192} 193}
193 194
194static bool ovl_is_private_xattr(const char *name) 195bool ovl_is_private_xattr(const char *name)
195{ 196{
196#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "." 197 return strncmp(name, OVL_XATTR_PREFIX,
197 return strncmp(name, OVL_XATTR_PRE_NAME, 198 sizeof(OVL_XATTR_PREFIX) - 1) == 0;
198 sizeof(OVL_XATTR_PRE_NAME) - 1) == 0;
199} 199}
200 200
201int ovl_setxattr(struct dentry *dentry, struct inode *inode, 201int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
202 const char *name, const void *value, 202 size_t size, int flags)
203 size_t size, int flags)
204{ 203{
205 int err; 204 int err;
206 struct dentry *upperdentry; 205 struct path realpath;
206 enum ovl_path_type type = ovl_path_real(dentry, &realpath);
207 const struct cred *old_cred; 207 const struct cred *old_cred;
208 208
209 err = ovl_want_write(dentry); 209 err = ovl_want_write(dentry);
210 if (err) 210 if (err)
211 goto out; 211 goto out;
212 212
213 if (!value && !OVL_TYPE_UPPER(type)) {
214 err = vfs_getxattr(realpath.dentry, name, NULL, 0);
215 if (err < 0)
216 goto out_drop_write;
217 }
218
213 err = ovl_copy_up(dentry); 219 err = ovl_copy_up(dentry);
214 if (err) 220 if (err)
215 goto out_drop_write; 221 goto out_drop_write;
216 222
217 upperdentry = ovl_dentry_upper(dentry); 223 if (!OVL_TYPE_UPPER(type))
224 ovl_path_upper(dentry, &realpath);
225
218 old_cred = ovl_override_creds(dentry->d_sb); 226 old_cred = ovl_override_creds(dentry->d_sb);
219 err = vfs_setxattr(upperdentry, name, value, size, flags); 227 if (value)
228 err = vfs_setxattr(realpath.dentry, name, value, size, flags);
229 else {
230 WARN_ON(flags != XATTR_REPLACE);
231 err = vfs_removexattr(realpath.dentry, name);
232 }
220 revert_creds(old_cred); 233 revert_creds(old_cred);
221 234
222out_drop_write: 235out_drop_write:
@@ -225,16 +238,13 @@ out:
225 return err; 238 return err;
226} 239}
227 240
228ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, 241int ovl_xattr_get(struct dentry *dentry, const char *name,
229 const char *name, void *value, size_t size) 242 void *value, size_t size)
230{ 243{
231 struct dentry *realdentry = ovl_dentry_real(dentry); 244 struct dentry *realdentry = ovl_dentry_real(dentry);
232 ssize_t res; 245 ssize_t res;
233 const struct cred *old_cred; 246 const struct cred *old_cred;
234 247
235 if (ovl_is_private_xattr(name))
236 return -ENODATA;
237
238 old_cred = ovl_override_creds(dentry->d_sb); 248 old_cred = ovl_override_creds(dentry->d_sb);
239 res = vfs_getxattr(realdentry, name, value, size); 249 res = vfs_getxattr(realdentry, name, value, size);
240 revert_creds(old_cred); 250 revert_creds(old_cred);
@@ -245,7 +255,8 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
245{ 255{
246 struct dentry *realdentry = ovl_dentry_real(dentry); 256 struct dentry *realdentry = ovl_dentry_real(dentry);
247 ssize_t res; 257 ssize_t res;
248 int off; 258 size_t len;
259 char *s;
249 const struct cred *old_cred; 260 const struct cred *old_cred;
250 261
251 old_cred = ovl_override_creds(dentry->d_sb); 262 old_cred = ovl_override_creds(dentry->d_sb);
@@ -255,73 +266,39 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
255 return res; 266 return res;
256 267
257 /* filter out private xattrs */ 268 /* filter out private xattrs */
258 for (off = 0; off < res;) { 269 for (s = list, len = res; len;) {
259 char *s = list + off; 270 size_t slen = strnlen(s, len) + 1;
260 size_t slen = strlen(s) + 1;
261 271
262 BUG_ON(off + slen > res); 272 /* underlying fs providing us with a broken xattr list? */
273 if (WARN_ON(slen > len))
274 return -EIO;
263 275
276 len -= slen;
264 if (ovl_is_private_xattr(s)) { 277 if (ovl_is_private_xattr(s)) {
265 res -= slen; 278 res -= slen;
266 memmove(s, s + slen, res - off); 279 memmove(s, s + slen, len);
267 } else { 280 } else {
268 off += slen; 281 s += slen;
269 } 282 }
270 } 283 }
271 284
272 return res; 285 return res;
273} 286}
274 287
275int ovl_removexattr(struct dentry *dentry, const char *name)
276{
277 int err;
278 struct path realpath;
279 enum ovl_path_type type = ovl_path_real(dentry, &realpath);
280 const struct cred *old_cred;
281
282 err = ovl_want_write(dentry);
283 if (err)
284 goto out;
285
286 err = -ENODATA;
287 if (ovl_is_private_xattr(name))
288 goto out_drop_write;
289
290 if (!OVL_TYPE_UPPER(type)) {
291 err = vfs_getxattr(realpath.dentry, name, NULL, 0);
292 if (err < 0)
293 goto out_drop_write;
294
295 err = ovl_copy_up(dentry);
296 if (err)
297 goto out_drop_write;
298
299 ovl_path_upper(dentry, &realpath);
300 }
301
302 old_cred = ovl_override_creds(dentry->d_sb);
303 err = vfs_removexattr(realpath.dentry, name);
304 revert_creds(old_cred);
305out_drop_write:
306 ovl_drop_write(dentry);
307out:
308 return err;
309}
310
311struct posix_acl *ovl_get_acl(struct inode *inode, int type) 288struct posix_acl *ovl_get_acl(struct inode *inode, int type)
312{ 289{
313 struct inode *realinode = ovl_inode_real(inode, NULL); 290 struct inode *realinode = ovl_inode_real(inode, NULL);
314 const struct cred *old_cred; 291 const struct cred *old_cred;
315 struct posix_acl *acl; 292 struct posix_acl *acl;
316 293
317 if (!IS_POSIXACL(realinode)) 294 if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
318 return NULL; 295 return NULL;
319 296
320 if (!realinode->i_op->get_acl) 297 if (!realinode->i_op->get_acl)
321 return NULL; 298 return NULL;
322 299
323 old_cred = ovl_override_creds(inode->i_sb); 300 old_cred = ovl_override_creds(inode->i_sb);
324 acl = realinode->i_op->get_acl(realinode, type); 301 acl = get_acl(realinode, type);
325 revert_creds(old_cred); 302 revert_creds(old_cred);
326 303
327 return acl; 304 return acl;
@@ -391,9 +368,9 @@ static const struct inode_operations ovl_file_inode_operations = {
391 .permission = ovl_permission, 368 .permission = ovl_permission,
392 .getattr = ovl_getattr, 369 .getattr = ovl_getattr,
393 .setxattr = generic_setxattr, 370 .setxattr = generic_setxattr,
394 .getxattr = ovl_getxattr, 371 .getxattr = generic_getxattr,
395 .listxattr = ovl_listxattr, 372 .listxattr = ovl_listxattr,
396 .removexattr = ovl_removexattr, 373 .removexattr = generic_removexattr,
397 .get_acl = ovl_get_acl, 374 .get_acl = ovl_get_acl,
398 .update_time = ovl_update_time, 375 .update_time = ovl_update_time,
399}; 376};
@@ -404,9 +381,9 @@ static const struct inode_operations ovl_symlink_inode_operations = {
404 .readlink = ovl_readlink, 381 .readlink = ovl_readlink,
405 .getattr = ovl_getattr, 382 .getattr = ovl_getattr,
406 .setxattr = generic_setxattr, 383 .setxattr = generic_setxattr,
407 .getxattr = ovl_getxattr, 384 .getxattr = generic_getxattr,
408 .listxattr = ovl_listxattr, 385 .listxattr = ovl_listxattr,
409 .removexattr = ovl_removexattr, 386 .removexattr = generic_removexattr,
410 .update_time = ovl_update_time, 387 .update_time = ovl_update_time,
411}; 388};
412 389
@@ -415,6 +392,9 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
415 inode->i_ino = get_next_ino(); 392 inode->i_ino = get_next_ino();
416 inode->i_mode = mode; 393 inode->i_mode = mode;
417 inode->i_flags |= S_NOCMTIME; 394 inode->i_flags |= S_NOCMTIME;
395#ifdef CONFIG_FS_POSIX_ACL
396 inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
397#endif
418 398
419 mode &= S_IFMT; 399 mode &= S_IFMT;
420 switch (mode) { 400 switch (mode) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e4f5c9536bfe..5813ccff8cd9 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -24,8 +24,8 @@ enum ovl_path_type {
24 (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) 24 (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
25 25
26 26
27#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay" 27#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
28#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque" 28#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
29 29
30#define OVL_ISUPPER_MASK 1UL 30#define OVL_ISUPPER_MASK 1UL
31 31
@@ -179,20 +179,21 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
179void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); 179void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
180void ovl_cache_free(struct list_head *list); 180void ovl_cache_free(struct list_head *list);
181int ovl_check_d_type_supported(struct path *realpath); 181int ovl_check_d_type_supported(struct path *realpath);
182void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
183 struct dentry *dentry, int level);
182 184
183/* inode.c */ 185/* inode.c */
184int ovl_setattr(struct dentry *dentry, struct iattr *attr); 186int ovl_setattr(struct dentry *dentry, struct iattr *attr);
185int ovl_permission(struct inode *inode, int mask); 187int ovl_permission(struct inode *inode, int mask);
186int ovl_setxattr(struct dentry *dentry, struct inode *inode, 188int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
187 const char *name, const void *value, 189 size_t size, int flags);
188 size_t size, int flags); 190int ovl_xattr_get(struct dentry *dentry, const char *name,
189ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, 191 void *value, size_t size);
190 const char *name, void *value, size_t size);
191ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); 192ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
192int ovl_removexattr(struct dentry *dentry, const char *name);
193struct posix_acl *ovl_get_acl(struct inode *inode, int type); 193struct posix_acl *ovl_get_acl(struct inode *inode, int type);
194int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); 194int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
195int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); 195int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
196bool ovl_is_private_xattr(const char *name);
196 197
197struct inode *ovl_new_inode(struct super_block *sb, umode_t mode); 198struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
198struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode); 199struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index cf37fc76fc9f..f241b4ee3d8a 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -248,7 +248,7 @@ static inline int ovl_dir_read(struct path *realpath,
248 err = rdd->err; 248 err = rdd->err;
249 } while (!err && rdd->count); 249 } while (!err && rdd->count);
250 250
251 if (!err && rdd->first_maybe_whiteout) 251 if (!err && rdd->first_maybe_whiteout && rdd->dentry)
252 err = ovl_check_whiteouts(realpath->dentry, rdd); 252 err = ovl_check_whiteouts(realpath->dentry, rdd);
253 253
254 fput(realfile); 254 fput(realfile);
@@ -606,3 +606,64 @@ int ovl_check_d_type_supported(struct path *realpath)
606 606
607 return rdd.d_type_supported; 607 return rdd.d_type_supported;
608} 608}
609
610static void ovl_workdir_cleanup_recurse(struct path *path, int level)
611{
612 int err;
613 struct inode *dir = path->dentry->d_inode;
614 LIST_HEAD(list);
615 struct ovl_cache_entry *p;
616 struct ovl_readdir_data rdd = {
617 .ctx.actor = ovl_fill_merge,
618 .dentry = NULL,
619 .list = &list,
620 .root = RB_ROOT,
621 .is_lowest = false,
622 };
623
624 err = ovl_dir_read(path, &rdd);
625 if (err)
626 goto out;
627
628 inode_lock_nested(dir, I_MUTEX_PARENT);
629 list_for_each_entry(p, &list, l_node) {
630 struct dentry *dentry;
631
632 if (p->name[0] == '.') {
633 if (p->len == 1)
634 continue;
635 if (p->len == 2 && p->name[1] == '.')
636 continue;
637 }
638 dentry = lookup_one_len(p->name, path->dentry, p->len);
639 if (IS_ERR(dentry))
640 continue;
641 if (dentry->d_inode)
642 ovl_workdir_cleanup(dir, path->mnt, dentry, level);
643 dput(dentry);
644 }
645 inode_unlock(dir);
646out:
647 ovl_cache_free(&list);
648}
649
650void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
651 struct dentry *dentry, int level)
652{
653 int err;
654
655 if (!d_is_dir(dentry) || level > 1) {
656 ovl_cleanup(dir, dentry);
657 return;
658 }
659
660 err = ovl_do_rmdir(dir, dentry);
661 if (err) {
662 struct path path = { .mnt = mnt, .dentry = dentry };
663
664 inode_unlock(dir);
665 ovl_workdir_cleanup_recurse(&path, level + 1);
666 inode_lock_nested(dir, I_MUTEX_PARENT);
667 ovl_cleanup(dir, dentry);
668 }
669}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 4036132842b5..e2a94a26767b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -814,6 +814,10 @@ retry:
814 struct kstat stat = { 814 struct kstat stat = {
815 .mode = S_IFDIR | 0, 815 .mode = S_IFDIR | 0,
816 }; 816 };
817 struct iattr attr = {
818 .ia_valid = ATTR_MODE,
819 .ia_mode = stat.mode,
820 };
817 821
818 if (work->d_inode) { 822 if (work->d_inode) {
819 err = -EEXIST; 823 err = -EEXIST;
@@ -821,7 +825,7 @@ retry:
821 goto out_dput; 825 goto out_dput;
822 826
823 retried = true; 827 retried = true;
824 ovl_cleanup(dir, work); 828 ovl_workdir_cleanup(dir, mnt, work, 0);
825 dput(work); 829 dput(work);
826 goto retry; 830 goto retry;
827 } 831 }
@@ -829,6 +833,21 @@ retry:
829 err = ovl_create_real(dir, work, &stat, NULL, NULL, true); 833 err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
830 if (err) 834 if (err)
831 goto out_dput; 835 goto out_dput;
836
837 err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
838 if (err && err != -ENODATA && err != -EOPNOTSUPP)
839 goto out_dput;
840
841 err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
842 if (err && err != -ENODATA && err != -EOPNOTSUPP)
843 goto out_dput;
844
845 /* Clear any inherited mode bits */
846 inode_lock(work->d_inode);
847 err = notify_change(work, &attr, NULL);
848 inode_unlock(work->d_inode);
849 if (err)
850 goto out_dput;
832 } 851 }
833out_unlock: 852out_unlock:
834 inode_unlock(dir); 853 inode_unlock(dir);
@@ -967,10 +986,19 @@ static unsigned int ovl_split_lowerdirs(char *str)
967 return ctr; 986 return ctr;
968} 987}
969 988
970static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler, 989static int __maybe_unused
971 struct dentry *dentry, struct inode *inode, 990ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
972 const char *name, const void *value, 991 struct dentry *dentry, struct inode *inode,
973 size_t size, int flags) 992 const char *name, void *buffer, size_t size)
993{
994 return ovl_xattr_get(dentry, handler->name, buffer, size);
995}
996
997static int __maybe_unused
998ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
999 struct dentry *dentry, struct inode *inode,
1000 const char *name, const void *value,
1001 size_t size, int flags)
974{ 1002{
975 struct dentry *workdir = ovl_workdir(dentry); 1003 struct dentry *workdir = ovl_workdir(dentry);
976 struct inode *realinode = ovl_inode_real(inode, NULL); 1004 struct inode *realinode = ovl_inode_real(inode, NULL);
@@ -998,19 +1026,22 @@ static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
998 1026
999 posix_acl_release(acl); 1027 posix_acl_release(acl);
1000 1028
1001 return ovl_setxattr(dentry, inode, handler->name, value, size, flags); 1029 err = ovl_xattr_set(dentry, handler->name, value, size, flags);
1030 if (!err)
1031 ovl_copyattr(ovl_inode_real(inode, NULL), inode);
1032
1033 return err;
1002 1034
1003out_acl_release: 1035out_acl_release:
1004 posix_acl_release(acl); 1036 posix_acl_release(acl);
1005 return err; 1037 return err;
1006} 1038}
1007 1039
1008static int ovl_other_xattr_set(const struct xattr_handler *handler, 1040static int ovl_own_xattr_get(const struct xattr_handler *handler,
1009 struct dentry *dentry, struct inode *inode, 1041 struct dentry *dentry, struct inode *inode,
1010 const char *name, const void *value, 1042 const char *name, void *buffer, size_t size)
1011 size_t size, int flags)
1012{ 1043{
1013 return ovl_setxattr(dentry, inode, name, value, size, flags); 1044 return -EPERM;
1014} 1045}
1015 1046
1016static int ovl_own_xattr_set(const struct xattr_handler *handler, 1047static int ovl_own_xattr_set(const struct xattr_handler *handler,
@@ -1021,42 +1052,59 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler,
1021 return -EPERM; 1052 return -EPERM;
1022} 1053}
1023 1054
1024static const struct xattr_handler ovl_posix_acl_access_xattr_handler = { 1055static int ovl_other_xattr_get(const struct xattr_handler *handler,
1056 struct dentry *dentry, struct inode *inode,
1057 const char *name, void *buffer, size_t size)
1058{
1059 return ovl_xattr_get(dentry, name, buffer, size);
1060}
1061
1062static int ovl_other_xattr_set(const struct xattr_handler *handler,
1063 struct dentry *dentry, struct inode *inode,
1064 const char *name, const void *value,
1065 size_t size, int flags)
1066{
1067 return ovl_xattr_set(dentry, name, value, size, flags);
1068}
1069
1070static const struct xattr_handler __maybe_unused
1071ovl_posix_acl_access_xattr_handler = {
1025 .name = XATTR_NAME_POSIX_ACL_ACCESS, 1072 .name = XATTR_NAME_POSIX_ACL_ACCESS,
1026 .flags = ACL_TYPE_ACCESS, 1073 .flags = ACL_TYPE_ACCESS,
1074 .get = ovl_posix_acl_xattr_get,
1027 .set = ovl_posix_acl_xattr_set, 1075 .set = ovl_posix_acl_xattr_set,
1028}; 1076};
1029 1077
1030static const struct xattr_handler ovl_posix_acl_default_xattr_handler = { 1078static const struct xattr_handler __maybe_unused
1079ovl_posix_acl_default_xattr_handler = {
1031 .name = XATTR_NAME_POSIX_ACL_DEFAULT, 1080 .name = XATTR_NAME_POSIX_ACL_DEFAULT,
1032 .flags = ACL_TYPE_DEFAULT, 1081 .flags = ACL_TYPE_DEFAULT,
1082 .get = ovl_posix_acl_xattr_get,
1033 .set = ovl_posix_acl_xattr_set, 1083 .set = ovl_posix_acl_xattr_set,
1034}; 1084};
1035 1085
1036static const struct xattr_handler ovl_own_xattr_handler = { 1086static const struct xattr_handler ovl_own_xattr_handler = {
1037 .prefix = OVL_XATTR_PREFIX, 1087 .prefix = OVL_XATTR_PREFIX,
1088 .get = ovl_own_xattr_get,
1038 .set = ovl_own_xattr_set, 1089 .set = ovl_own_xattr_set,
1039}; 1090};
1040 1091
1041static const struct xattr_handler ovl_other_xattr_handler = { 1092static const struct xattr_handler ovl_other_xattr_handler = {
1042 .prefix = "", /* catch all */ 1093 .prefix = "", /* catch all */
1094 .get = ovl_other_xattr_get,
1043 .set = ovl_other_xattr_set, 1095 .set = ovl_other_xattr_set,
1044}; 1096};
1045 1097
1046static const struct xattr_handler *ovl_xattr_handlers[] = { 1098static const struct xattr_handler *ovl_xattr_handlers[] = {
1099#ifdef CONFIG_FS_POSIX_ACL
1047 &ovl_posix_acl_access_xattr_handler, 1100 &ovl_posix_acl_access_xattr_handler,
1048 &ovl_posix_acl_default_xattr_handler, 1101 &ovl_posix_acl_default_xattr_handler,
1102#endif
1049 &ovl_own_xattr_handler, 1103 &ovl_own_xattr_handler,
1050 &ovl_other_xattr_handler, 1104 &ovl_other_xattr_handler,
1051 NULL 1105 NULL
1052}; 1106};
1053 1107
1054static const struct xattr_handler *ovl_xattr_noacl_handlers[] = {
1055 &ovl_own_xattr_handler,
1056 &ovl_other_xattr_handler,
1057 NULL,
1058};
1059
1060static int ovl_fill_super(struct super_block *sb, void *data, int silent) 1108static int ovl_fill_super(struct super_block *sb, void *data, int silent)
1061{ 1109{
1062 struct path upperpath = { NULL, NULL }; 1110 struct path upperpath = { NULL, NULL };
@@ -1132,7 +1180,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
1132 err = -EINVAL; 1180 err = -EINVAL;
1133 stacklen = ovl_split_lowerdirs(lowertmp); 1181 stacklen = ovl_split_lowerdirs(lowertmp);
1134 if (stacklen > OVL_MAX_STACK) { 1182 if (stacklen > OVL_MAX_STACK) {
1135 pr_err("overlayfs: too many lower directries, limit is %d\n", 1183 pr_err("overlayfs: too many lower directories, limit is %d\n",
1136 OVL_MAX_STACK); 1184 OVL_MAX_STACK);
1137 goto out_free_lowertmp; 1185 goto out_free_lowertmp;
1138 } else if (!ufs->config.upperdir && stacklen == 1) { 1186 } else if (!ufs->config.upperdir && stacklen == 1) {
@@ -1269,10 +1317,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
1269 1317
1270 sb->s_magic = OVERLAYFS_SUPER_MAGIC; 1318 sb->s_magic = OVERLAYFS_SUPER_MAGIC;
1271 sb->s_op = &ovl_super_operations; 1319 sb->s_op = &ovl_super_operations;
1272 if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) 1320 sb->s_xattr = ovl_xattr_handlers;
1273 sb->s_xattr = ovl_xattr_handlers;
1274 else
1275 sb->s_xattr = ovl_xattr_noacl_handlers;
1276 sb->s_root = root_dentry; 1321 sb->s_root = root_dentry;
1277 sb->s_fs_info = ufs; 1322 sb->s_fs_info = ufs;
1278 sb->s_flags |= MS_POSIXACL; 1323 sb->s_flags |= MS_POSIXACL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 4b32928f5426..4ebe6b2e5217 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -144,10 +144,8 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
144 struct page *page = buf->page; 144 struct page *page = buf->page;
145 145
146 if (page_count(page) == 1) { 146 if (page_count(page) == 1) {
147 if (memcg_kmem_enabled()) { 147 if (memcg_kmem_enabled())
148 memcg_kmem_uncharge(page, 0); 148 memcg_kmem_uncharge(page, 0);
149 __ClearPageKmemcg(page);
150 }
151 __SetPageLocked(page); 149 __SetPageLocked(page);
152 return 0; 150 return 0;
153 } 151 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 54e270262979..ac0df4dde823 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1556,18 +1556,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
1556static int proc_exe_link(struct dentry *dentry, struct path *exe_path) 1556static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
1557{ 1557{
1558 struct task_struct *task; 1558 struct task_struct *task;
1559 struct mm_struct *mm;
1560 struct file *exe_file; 1559 struct file *exe_file;
1561 1560
1562 task = get_proc_task(d_inode(dentry)); 1561 task = get_proc_task(d_inode(dentry));
1563 if (!task) 1562 if (!task)
1564 return -ENOENT; 1563 return -ENOENT;
1565 mm = get_task_mm(task); 1564 exe_file = get_task_exe_file(task);
1566 put_task_struct(task); 1565 put_task_struct(task);
1567 if (!mm)
1568 return -ENOENT;
1569 exe_file = get_mm_exe_file(mm);
1570 mmput(mm);
1571 if (exe_file) { 1566 if (exe_file) {
1572 *exe_path = exe_file->f_path; 1567 *exe_path = exe_file->f_path;
1573 path_get(&exe_file->f_path); 1568 path_get(&exe_file->f_path);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a939f5ed7f89..5c89a07e3d7f 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
430static ssize_t 430static ssize_t
431read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) 431read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
432{ 432{
433 char *buf = file->private_data;
433 ssize_t acc = 0; 434 ssize_t acc = 0;
434 size_t size, tsz; 435 size_t size, tsz;
435 size_t elf_buflen; 436 size_t elf_buflen;
@@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
500 if (clear_user(buffer, tsz)) 501 if (clear_user(buffer, tsz))
501 return -EFAULT; 502 return -EFAULT;
502 } else if (is_vmalloc_or_module_addr((void *)start)) { 503 } else if (is_vmalloc_or_module_addr((void *)start)) {
503 char * elf_buf; 504 vread(buf, (char *)start, tsz);
504
505 elf_buf = kzalloc(tsz, GFP_KERNEL);
506 if (!elf_buf)
507 return -ENOMEM;
508 vread(elf_buf, (char *)start, tsz);
509 /* we have to zero-fill user buffer even if no read */ 505 /* we have to zero-fill user buffer even if no read */
510 if (copy_to_user(buffer, elf_buf, tsz)) { 506 if (copy_to_user(buffer, buf, tsz))
511 kfree(elf_buf);
512 return -EFAULT; 507 return -EFAULT;
513 }
514 kfree(elf_buf);
515 } else { 508 } else {
516 if (kern_addr_valid(start)) { 509 if (kern_addr_valid(start)) {
517 unsigned long n; 510 unsigned long n;
518 511
519 n = copy_to_user(buffer, (char *)start, tsz); 512 /*
513 * Using bounce buffer to bypass the
514 * hardened user copy kernel text checks.
515 */
516 memcpy(buf, (char *) start, tsz);
517 n = copy_to_user(buffer, buf, tsz);
520 /* 518 /*
521 * We cannot distinguish between fault on source 519 * We cannot distinguish between fault on source
522 * and fault on destination. When this happens 520 * and fault on destination. When this happens
@@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp)
549{ 547{
550 if (!capable(CAP_SYS_RAWIO)) 548 if (!capable(CAP_SYS_RAWIO))
551 return -EPERM; 549 return -EPERM;
550
551 filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
552 if (!filp->private_data)
553 return -ENOMEM;
554
552 if (kcore_need_update) 555 if (kcore_need_update)
553 kcore_update_ram(); 556 kcore_update_ram();
554 if (i_size_read(inode) != proc_root_kcore->size) { 557 if (i_size_read(inode) != proc_root_kcore->size) {
@@ -559,10 +562,16 @@ static int open_kcore(struct inode *inode, struct file *filp)
559 return 0; 562 return 0;
560} 563}
561 564
565static int release_kcore(struct inode *inode, struct file *file)
566{
567 kfree(file->private_data);
568 return 0;
569}
562 570
563static const struct file_operations proc_kcore_operations = { 571static const struct file_operations proc_kcore_operations = {
564 .read = read_kcore, 572 .read = read_kcore,
565 .open = open_kcore, 573 .open = open_kcore,
574 .release = release_kcore,
566 .llseek = default_llseek, 575 .llseek = default_llseek,
567}; 576};
568 577
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 09e18fdf61e5..b9a8c813e5e6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -46,7 +46,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
46 cached = 0; 46 cached = 0;
47 47
48 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 48 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
49 pages[lru] = global_page_state(NR_LRU_BASE + lru); 49 pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
50 50
51 available = si_mem_available(); 51 available = si_mem_available();
52 52
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187d84ef9de9..f6fa99eca515 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
581 mss->anonymous_thp += HPAGE_PMD_SIZE; 581 mss->anonymous_thp += HPAGE_PMD_SIZE;
582 else if (PageSwapBacked(page)) 582 else if (PageSwapBacked(page))
583 mss->shmem_thp += HPAGE_PMD_SIZE; 583 mss->shmem_thp += HPAGE_PMD_SIZE;
584 else if (is_zone_device_page(page))
585 /* pass */;
584 else 586 else
585 VM_BUG_ON_PAGE(1, page); 587 VM_BUG_ON_PAGE(1, page);
586 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); 588 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 183a212694bf..12af0490322f 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -27,9 +27,17 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/ramfs.h> 29#include <linux/ramfs.h>
30#include <linux/sched.h>
30 31
31#include "internal.h" 32#include "internal.h"
32 33
34static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
35 unsigned long addr, unsigned long len, unsigned long pgoff,
36 unsigned long flags)
37{
38 return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
39}
40
33const struct file_operations ramfs_file_operations = { 41const struct file_operations ramfs_file_operations = {
34 .read_iter = generic_file_read_iter, 42 .read_iter = generic_file_read_iter,
35 .write_iter = generic_file_write_iter, 43 .write_iter = generic_file_write_iter,
@@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = {
38 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
39 .splice_write = iter_file_splice_write, 47 .splice_write = iter_file_splice_write,
40 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
49 .get_unmapped_area = ramfs_mmu_get_unmapped_area,
41}; 50};
42 51
43const struct inode_operations ramfs_file_inode_operations = { 52const struct inode_operations ramfs_file_inode_operations = {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 19f532e7d35e..6dc4296eed62 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -223,8 +223,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
223 size -= n; 223 size -= n;
224 buf += n; 224 buf += n;
225 copied += n; 225 copied += n;
226 if (!m->count) 226 if (!m->count) {
227 m->from = 0;
227 m->index++; 228 m->index++;
229 }
228 if (!size) 230 if (!size)
229 goto Done; 231 goto Done;
230 } 232 }
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f35523d4fa3a..b803213d1307 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -114,9 +114,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
114 * If buf != of->prealloc_buf, we don't know how 114 * If buf != of->prealloc_buf, we don't know how
115 * large it is, so cannot safely pass it to ->show 115 * large it is, so cannot safely pass it to ->show
116 */ 116 */
117 if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) 117 if (WARN_ON_ONCE(buf != of->prealloc_buf))
118 return 0; 118 return 0;
119 len = ops->show(kobj, of->kn->priv, buf); 119 len = ops->show(kobj, of->kn->priv, buf);
120 if (pos) {
121 if (len <= pos)
122 return 0;
123 len -= pos;
124 memmove(buf, buf + pos, len);
125 }
120 return min(count, len); 126 return min(count, len);
121} 127}
122 128
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index b45345d701e7..51157da3f76e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
370 370
371 p = c->gap_lebs; 371 p = c->gap_lebs;
372 do { 372 do {
373 ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); 373 ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs);
374 written = layout_leb_in_gaps(c, p); 374 written = layout_leb_in_gaps(c, p);
375 if (written < 0) { 375 if (written < 0) {
376 err = written; 376 err = written;
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e237811f09ce..11a004114eba 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -575,7 +575,8 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
575 dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, 575 dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name,
576 inode->i_ino, dentry, size); 576 inode->i_ino, dentry, size);
577 577
578 return __ubifs_getxattr(inode, name, buffer, size); 578 name = xattr_full_name(handler, name);
579 return __ubifs_getxattr(inode, name, buffer, size);
579} 580}
580 581
581static int ubifs_xattr_set(const struct xattr_handler *handler, 582static int ubifs_xattr_set(const struct xattr_handler *handler,
@@ -586,6 +587,8 @@ static int ubifs_xattr_set(const struct xattr_handler *handler,
586 dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", 587 dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
587 name, inode->i_ino, dentry, size); 588 name, inode->i_ino, dentry, size);
588 589
590 name = xattr_full_name(handler, name);
591
589 if (value) 592 if (value)
590 return __ubifs_setxattr(inode, name, value, size, flags); 593 return __ubifs_setxattr(inode, name, value, size, flags);
591 else 594 else
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 776ae2f325d1..05b5243d89f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1582,6 +1582,7 @@ xfs_alloc_ag_vextent_small(
1582 xfs_extlen_t *flenp, /* result length */ 1582 xfs_extlen_t *flenp, /* result length */
1583 int *stat) /* status: 0-freelist, 1-normal/none */ 1583 int *stat) /* status: 0-freelist, 1-normal/none */
1584{ 1584{
1585 struct xfs_owner_info oinfo;
1585 int error; 1586 int error;
1586 xfs_agblock_t fbno; 1587 xfs_agblock_t fbno;
1587 xfs_extlen_t flen; 1588 xfs_extlen_t flen;
@@ -1624,6 +1625,18 @@ xfs_alloc_ag_vextent_small(
1624 error0); 1625 error0);
1625 args->wasfromfl = 1; 1626 args->wasfromfl = 1;
1626 trace_xfs_alloc_small_freelist(args); 1627 trace_xfs_alloc_small_freelist(args);
1628
1629 /*
1630 * If we're feeding an AGFL block to something that
1631 * doesn't live in the free space, we need to clear
1632 * out the OWN_AG rmap.
1633 */
1634 xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
1635 error = xfs_rmap_free(args->tp, args->agbp, args->agno,
1636 fbno, 1, &oinfo);
1637 if (error)
1638 goto error0;
1639
1627 *stat = 0; 1640 *stat = 0;
1628 return 0; 1641 return 0;
1629 } 1642 }
@@ -2264,6 +2277,9 @@ xfs_alloc_log_agf(
2264 offsetof(xfs_agf_t, agf_longest), 2277 offsetof(xfs_agf_t, agf_longest),
2265 offsetof(xfs_agf_t, agf_btreeblks), 2278 offsetof(xfs_agf_t, agf_btreeblks),
2266 offsetof(xfs_agf_t, agf_uuid), 2279 offsetof(xfs_agf_t, agf_uuid),
2280 offsetof(xfs_agf_t, agf_rmap_blocks),
2281 /* needed so that we don't log the whole rest of the structure: */
2282 offsetof(xfs_agf_t, agf_spare64),
2267 sizeof(xfs_agf_t) 2283 sizeof(xfs_agf_t)
2268 }; 2284 };
2269 2285
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b5c213a051cd..08569792fe20 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1814,6 +1814,10 @@ xfs_btree_lookup(
1814 1814
1815 XFS_BTREE_STATS_INC(cur, lookup); 1815 XFS_BTREE_STATS_INC(cur, lookup);
1816 1816
1817 /* No such thing as a zero-level tree. */
1818 if (cur->bc_nlevels == 0)
1819 return -EFSCORRUPTED;
1820
1817 block = NULL; 1821 block = NULL;
1818 keyno = 0; 1822 keyno = 0;
1819 1823
@@ -4554,15 +4558,22 @@ xfs_btree_simple_query_range(
4554 if (error) 4558 if (error)
4555 goto out; 4559 goto out;
4556 4560
4561 /* Nothing? See if there's anything to the right. */
4562 if (!stat) {
4563 error = xfs_btree_increment(cur, 0, &stat);
4564 if (error)
4565 goto out;
4566 }
4567
4557 while (stat) { 4568 while (stat) {
4558 /* Find the record. */ 4569 /* Find the record. */
4559 error = xfs_btree_get_rec(cur, &recp, &stat); 4570 error = xfs_btree_get_rec(cur, &recp, &stat);
4560 if (error || !stat) 4571 if (error || !stat)
4561 break; 4572 break;
4562 cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4563 4573
4564 /* Skip if high_key(rec) < low_key. */ 4574 /* Skip if high_key(rec) < low_key. */
4565 if (firstrec) { 4575 if (firstrec) {
4576 cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4566 firstrec = false; 4577 firstrec = false;
4567 diff = cur->bc_ops->diff_two_keys(cur, low_key, 4578 diff = cur->bc_ops->diff_two_keys(cur, low_key,
4568 &rec_key); 4579 &rec_key);
@@ -4571,6 +4582,7 @@ xfs_btree_simple_query_range(
4571 } 4582 }
4572 4583
4573 /* Stop if high_key < low_key(rec). */ 4584 /* Stop if high_key < low_key(rec). */
4585 cur->bc_ops->init_key_from_rec(&rec_key, recp);
4574 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key); 4586 diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
4575 if (diff > 0) 4587 if (diff > 0)
4576 break; 4588 break;
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 054a2032fdb3..c221d0ecd52e 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -194,7 +194,7 @@ xfs_defer_trans_abort(
194 /* Abort intent items. */ 194 /* Abort intent items. */
195 list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { 195 list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
196 trace_xfs_defer_pending_abort(tp->t_mountp, dfp); 196 trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
197 if (dfp->dfp_committed) 197 if (!dfp->dfp_done)
198 dfp->dfp_type->abort_intent(dfp->dfp_intent); 198 dfp->dfp_type->abort_intent(dfp->dfp_intent);
199 } 199 }
200 200
@@ -290,7 +290,6 @@ xfs_defer_finish(
290 struct xfs_defer_pending *dfp; 290 struct xfs_defer_pending *dfp;
291 struct list_head *li; 291 struct list_head *li;
292 struct list_head *n; 292 struct list_head *n;
293 void *done_item = NULL;
294 void *state; 293 void *state;
295 int error = 0; 294 int error = 0;
296 void (*cleanup_fn)(struct xfs_trans *, void *, int); 295 void (*cleanup_fn)(struct xfs_trans *, void *, int);
@@ -309,19 +308,11 @@ xfs_defer_finish(
309 if (error) 308 if (error)
310 goto out; 309 goto out;
311 310
312 /* Mark all pending intents as committed. */
313 list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) {
314 if (dfp->dfp_committed)
315 break;
316 trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp);
317 dfp->dfp_committed = true;
318 }
319
320 /* Log an intent-done item for the first pending item. */ 311 /* Log an intent-done item for the first pending item. */
321 dfp = list_first_entry(&dop->dop_pending, 312 dfp = list_first_entry(&dop->dop_pending,
322 struct xfs_defer_pending, dfp_list); 313 struct xfs_defer_pending, dfp_list);
323 trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); 314 trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
324 done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent, 315 dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
325 dfp->dfp_count); 316 dfp->dfp_count);
326 cleanup_fn = dfp->dfp_type->finish_cleanup; 317 cleanup_fn = dfp->dfp_type->finish_cleanup;
327 318
@@ -331,7 +322,7 @@ xfs_defer_finish(
331 list_del(li); 322 list_del(li);
332 dfp->dfp_count--; 323 dfp->dfp_count--;
333 error = dfp->dfp_type->finish_item(*tp, dop, li, 324 error = dfp->dfp_type->finish_item(*tp, dop, li,
334 done_item, &state); 325 dfp->dfp_done, &state);
335 if (error) { 326 if (error) {
336 /* 327 /*
337 * Clean up after ourselves and jump out. 328 * Clean up after ourselves and jump out.
@@ -428,8 +419,8 @@ xfs_defer_add(
428 dfp = kmem_alloc(sizeof(struct xfs_defer_pending), 419 dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
429 KM_SLEEP | KM_NOFS); 420 KM_SLEEP | KM_NOFS);
430 dfp->dfp_type = defer_op_types[type]; 421 dfp->dfp_type = defer_op_types[type];
431 dfp->dfp_committed = false;
432 dfp->dfp_intent = NULL; 422 dfp->dfp_intent = NULL;
423 dfp->dfp_done = NULL;
433 dfp->dfp_count = 0; 424 dfp->dfp_count = 0;
434 INIT_LIST_HEAD(&dfp->dfp_work); 425 INIT_LIST_HEAD(&dfp->dfp_work);
435 list_add_tail(&dfp->dfp_list, &dop->dop_intake); 426 list_add_tail(&dfp->dfp_list, &dop->dop_intake);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index cc3981c48296..e96533d178cf 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -30,8 +30,8 @@ struct xfs_defer_op_type;
30struct xfs_defer_pending { 30struct xfs_defer_pending {
31 const struct xfs_defer_op_type *dfp_type; /* function pointers */ 31 const struct xfs_defer_op_type *dfp_type; /* function pointers */
32 struct list_head dfp_list; /* pending items */ 32 struct list_head dfp_list; /* pending items */
33 bool dfp_committed; /* committed trans? */
34 void *dfp_intent; /* log intent item */ 33 void *dfp_intent; /* log intent item */
34 void *dfp_done; /* log done item */
35 struct list_head dfp_work; /* work items */ 35 struct list_head dfp_work; /* work items */
36 unsigned int dfp_count; /* # extent items */ 36 unsigned int dfp_count; /* # extent items */
37}; 37};
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index f814d42c73b2..270fb5cf4fa1 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -640,12 +640,15 @@ typedef struct xfs_agf {
640 __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ 640 __be32 agf_btreeblks; /* # of blocks held in AGF btrees */
641 uuid_t agf_uuid; /* uuid of filesystem */ 641 uuid_t agf_uuid; /* uuid of filesystem */
642 642
643 __be32 agf_rmap_blocks; /* rmapbt blocks used */
644 __be32 agf_padding; /* padding */
645
643 /* 646 /*
644 * reserve some contiguous space for future logged fields before we add 647 * reserve some contiguous space for future logged fields before we add
645 * the unlogged fields. This makes the range logging via flags and 648 * the unlogged fields. This makes the range logging via flags and
646 * structure offsets much simpler. 649 * structure offsets much simpler.
647 */ 650 */
648 __be64 agf_spare64[16]; 651 __be64 agf_spare64[15];
649 652
650 /* unlogged fields, written during buffer writeback. */ 653 /* unlogged fields, written during buffer writeback. */
651 __be64 agf_lsn; /* last write sequence */ 654 __be64 agf_lsn; /* last write sequence */
@@ -670,7 +673,9 @@ typedef struct xfs_agf {
670#define XFS_AGF_LONGEST 0x00000400 673#define XFS_AGF_LONGEST 0x00000400
671#define XFS_AGF_BTREEBLKS 0x00000800 674#define XFS_AGF_BTREEBLKS 0x00000800
672#define XFS_AGF_UUID 0x00001000 675#define XFS_AGF_UUID 0x00001000
673#define XFS_AGF_NUM_BITS 13 676#define XFS_AGF_RMAP_BLOCKS 0x00002000
677#define XFS_AGF_SPARE64 0x00004000
678#define XFS_AGF_NUM_BITS 15
674#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) 679#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
675 680
676#define XFS_AGF_FLAGS \ 681#define XFS_AGF_FLAGS \
@@ -686,7 +691,9 @@ typedef struct xfs_agf {
686 { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ 691 { XFS_AGF_FREEBLKS, "FREEBLKS" }, \
687 { XFS_AGF_LONGEST, "LONGEST" }, \ 692 { XFS_AGF_LONGEST, "LONGEST" }, \
688 { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ 693 { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
689 { XFS_AGF_UUID, "UUID" } 694 { XFS_AGF_UUID, "UUID" }, \
695 { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \
696 { XFS_AGF_SPARE64, "SPARE64" }
690 697
691/* disk block (xfs_daddr_t) in the AG */ 698/* disk block (xfs_daddr_t) in the AG */
692#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) 699#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index bc1faebc84ec..17b8eeb34ac8 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -98,6 +98,8 @@ xfs_rmapbt_alloc_block(
98 union xfs_btree_ptr *new, 98 union xfs_btree_ptr *new,
99 int *stat) 99 int *stat)
100{ 100{
101 struct xfs_buf *agbp = cur->bc_private.a.agbp;
102 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
101 int error; 103 int error;
102 xfs_agblock_t bno; 104 xfs_agblock_t bno;
103 105
@@ -124,6 +126,8 @@ xfs_rmapbt_alloc_block(
124 126
125 xfs_trans_agbtree_delta(cur->bc_tp, 1); 127 xfs_trans_agbtree_delta(cur->bc_tp, 1);
126 new->s = cpu_to_be32(bno); 128 new->s = cpu_to_be32(bno);
129 be32_add_cpu(&agf->agf_rmap_blocks, 1);
130 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
127 131
128 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 132 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
129 *stat = 1; 133 *stat = 1;
@@ -143,6 +147,8 @@ xfs_rmapbt_free_block(
143 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); 147 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
144 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, 148 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
145 bno, 1); 149 bno, 1);
150 be32_add_cpu(&agf->agf_rmap_blocks, -1);
151 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
146 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); 152 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
147 if (error) 153 if (error)
148 return error; 154 return error;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 0e3d4f5ec33c..4aecc5fefe96 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -583,7 +583,8 @@ xfs_sb_verify(
583 * Only check the in progress field for the primary superblock as 583 * Only check the in progress field for the primary superblock as
584 * mkfs.xfs doesn't clear it from secondary superblocks. 584 * mkfs.xfs doesn't clear it from secondary superblocks.
585 */ 585 */
586 return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, 586 return xfs_mount_validate_sb(mp, &sb,
587 bp->b_maps[0].bm_bn == XFS_SB_DADDR,
587 check_version); 588 check_version);
588} 589}
589 590
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 47a318ce82e0..b5b9bffe3520 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -115,7 +115,6 @@ xfs_buf_ioacct_dec(
115 if (!(bp->b_flags & _XBF_IN_FLIGHT)) 115 if (!(bp->b_flags & _XBF_IN_FLIGHT))
116 return; 116 return;
117 117
118 ASSERT(bp->b_flags & XBF_ASYNC);
119 bp->b_flags &= ~_XBF_IN_FLIGHT; 118 bp->b_flags &= ~_XBF_IN_FLIGHT;
120 percpu_counter_dec(&bp->b_target->bt_io_count); 119 percpu_counter_dec(&bp->b_target->bt_io_count);
121} 120}
@@ -1612,7 +1611,7 @@ xfs_wait_buftarg(
1612 */ 1611 */
1613 while (percpu_counter_sum(&btp->bt_io_count)) 1612 while (percpu_counter_sum(&btp->bt_io_count))
1614 delay(100); 1613 delay(100);
1615 drain_workqueue(btp->bt_mount->m_buf_workqueue); 1614 flush_workqueue(btp->bt_mount->m_buf_workqueue);
1616 1615
1617 /* loop until there is nothing left on the lru list. */ 1616 /* loop until there is nothing left on the lru list. */
1618 while (list_lru_count(&btp->bt_lru)) { 1617 while (list_lru_count(&btp->bt_lru)) {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ed95e5bb04e6..e612a0233710 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -741,9 +741,20 @@ xfs_file_dax_write(
741 * page is inserted into the pagecache when we have to serve a write 741 * page is inserted into the pagecache when we have to serve a write
742 * fault on a hole. It should never be dirtied and can simply be 742 * fault on a hole. It should never be dirtied and can simply be
743 * dropped from the pagecache once we get real data for the page. 743 * dropped from the pagecache once we get real data for the page.
744 *
745 * XXX: This is racy against mmap, and there's nothing we can do about
746 * it. dax_do_io() should really do this invalidation internally as
747 * it will know if we've allocated over a hole for this specific IO and
748 * if so it needs to update the mapping tree and invalidate existing
749 * PTEs over the newly allocated range. Remove this invalidation when
750 * dax_do_io() is fixed up.
744 */ 751 */
745 if (mapping->nrpages) { 752 if (mapping->nrpages) {
746 ret = invalidate_inode_pages2(mapping); 753 loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
754
755 ret = invalidate_inode_pages2_range(mapping,
756 iocb->ki_pos >> PAGE_SHIFT,
757 end >> PAGE_SHIFT);
747 WARN_ON_ONCE(ret); 758 WARN_ON_ONCE(ret);
748 } 759 }
749 760
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0f96847b90e1..0b7f986745c1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -248,6 +248,7 @@ xfs_growfs_data_private(
248 agf->agf_roots[XFS_BTNUM_RMAPi] = 248 agf->agf_roots[XFS_BTNUM_RMAPi] =
249 cpu_to_be32(XFS_RMAP_BLOCK(mp)); 249 cpu_to_be32(XFS_RMAP_BLOCK(mp));
250 agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); 250 agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
251 agf->agf_rmap_blocks = cpu_to_be32(1);
251 } 252 }
252 253
253 agf->agf_flfirst = cpu_to_be32(1); 254 agf->agf_flfirst = cpu_to_be32(1);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2114d53df433..2af0dda1c978 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -715,12 +715,16 @@ xfs_iomap_write_allocate(
715 * is in the delayed allocation extent on which we sit 715 * is in the delayed allocation extent on which we sit
716 * but before our buffer starts. 716 * but before our buffer starts.
717 */ 717 */
718
719 nimaps = 0; 718 nimaps = 0;
720 while (nimaps == 0) { 719 while (nimaps == 0) {
721 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); 720 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
722 721 /*
723 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres, 722 * We have already reserved space for the extent and any
723 * indirect blocks when creating the delalloc extent,
724 * there is no need to reserve space in this transaction
725 * again.
726 */
727 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
724 0, XFS_TRANS_RESERVE, &tp); 728 0, XFS_TRANS_RESERVE, &tp);
725 if (error) 729 if (error)
726 return error; 730 return error;
@@ -1037,20 +1041,14 @@ xfs_file_iomap_begin(
1037 return error; 1041 return error;
1038 1042
1039 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); 1043 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
1040 xfs_bmbt_to_iomap(ip, iomap, &imap);
1041 } else if (nimaps) {
1042 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1043 trace_xfs_iomap_found(ip, offset, length, 0, &imap);
1044 xfs_bmbt_to_iomap(ip, iomap, &imap);
1045 } else { 1044 } else {
1045 ASSERT(nimaps);
1046
1046 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1047 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1047 trace_xfs_iomap_not_found(ip, offset, length, 0, &imap); 1048 trace_xfs_iomap_found(ip, offset, length, 0, &imap);
1048 iomap->blkno = IOMAP_NULL_BLOCK;
1049 iomap->type = IOMAP_HOLE;
1050 iomap->offset = offset;
1051 iomap->length = length;
1052 } 1049 }
1053 1050
1051 xfs_bmbt_to_iomap(ip, iomap, &imap);
1054 return 0; 1052 return 0;
1055} 1053}
1056 1054
@@ -1112,3 +1110,48 @@ struct iomap_ops xfs_iomap_ops = {
1112 .iomap_begin = xfs_file_iomap_begin, 1110 .iomap_begin = xfs_file_iomap_begin,
1113 .iomap_end = xfs_file_iomap_end, 1111 .iomap_end = xfs_file_iomap_end,
1114}; 1112};
1113
1114static int
1115xfs_xattr_iomap_begin(
1116 struct inode *inode,
1117 loff_t offset,
1118 loff_t length,
1119 unsigned flags,
1120 struct iomap *iomap)
1121{
1122 struct xfs_inode *ip = XFS_I(inode);
1123 struct xfs_mount *mp = ip->i_mount;
1124 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
1125 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
1126 struct xfs_bmbt_irec imap;
1127 int nimaps = 1, error = 0;
1128 unsigned lockmode;
1129
1130 if (XFS_FORCED_SHUTDOWN(mp))
1131 return -EIO;
1132
1133 lockmode = xfs_ilock_data_map_shared(ip);
1134
1135 /* if there is no attribute fork or no extents, return ENOENT */
1136 if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
1137 error = -ENOENT;
1138 goto out_unlock;
1139 }
1140
1141 ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
1142 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1143 &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK);
1144out_unlock:
1145 xfs_iunlock(ip, lockmode);
1146
1147 if (!error) {
1148 ASSERT(nimaps);
1149 xfs_bmbt_to_iomap(ip, iomap, &imap);
1150 }
1151
1152 return error;
1153}
1154
1155struct iomap_ops xfs_xattr_iomap_ops = {
1156 .iomap_begin = xfs_xattr_iomap_begin,
1157};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e066d045e2ff..fb8aca3d69ab 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -35,5 +35,6 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
35 struct xfs_bmbt_irec *); 35 struct xfs_bmbt_irec *);
36 36
37extern struct iomap_ops xfs_iomap_ops; 37extern struct iomap_ops xfs_iomap_ops;
38extern struct iomap_ops xfs_xattr_iomap_ops;
38 39
39#endif /* __XFS_IOMAP_H__*/ 40#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ab820f84ed50..b24c3102fa93 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1009,7 +1009,14 @@ xfs_vn_fiemap(
1009 int error; 1009 int error;
1010 1010
1011 xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); 1011 xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
1012 error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops); 1012 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
1013 fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
1014 error = iomap_fiemap(inode, fieinfo, start, length,
1015 &xfs_xattr_iomap_ops);
1016 } else {
1017 error = iomap_fiemap(inode, fieinfo, start, length,
1018 &xfs_iomap_ops);
1019 }
1013 xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); 1020 xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
1014 1021
1015 return error; 1022 return error;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 24ef83ef04de..fd6be45b3a1e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1574,9 +1574,16 @@ xfs_fs_fill_super(
1574 } 1574 }
1575 } 1575 }
1576 1576
1577 if (xfs_sb_version_hasrmapbt(&mp->m_sb)) 1577 if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
1578 if (mp->m_sb.sb_rblocks) {
1579 xfs_alert(mp,
1580 "EXPERIMENTAL reverse mapping btree not compatible with realtime device!");
1581 error = -EINVAL;
1582 goto out_filestream_unmount;
1583 }
1578 xfs_alert(mp, 1584 xfs_alert(mp,
1579 "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); 1585 "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
1586 }
1580 1587
1581 error = xfs_mountfs(mp); 1588 error = xfs_mountfs(mp);
1582 if (error) 1589 if (error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 551b7e26980c..d303a665dba9 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1298,7 +1298,6 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
1298DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); 1298DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
1299DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 1299DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
1300DEFINE_IOMAP_EVENT(xfs_iomap_found); 1300DEFINE_IOMAP_EVENT(xfs_iomap_found);
1301DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
1302 1301
1303DECLARE_EVENT_CLASS(xfs_simple_io_class, 1302DECLARE_EVENT_CLASS(xfs_simple_io_class,
1304 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 1303 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -2296,7 +2295,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
2296 __entry->dev = mp ? mp->m_super->s_dev : 0; 2295 __entry->dev = mp ? mp->m_super->s_dev : 0;
2297 __entry->type = dfp->dfp_type->type; 2296 __entry->type = dfp->dfp_type->type;
2298 __entry->intent = dfp->dfp_intent; 2297 __entry->intent = dfp->dfp_intent;
2299 __entry->committed = dfp->dfp_committed; 2298 __entry->committed = dfp->dfp_done != NULL;
2300 __entry->nr = dfp->dfp_count; 2299 __entry->nr = dfp->dfp_count;
2301 ), 2300 ),
2302 TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n", 2301 TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n",