summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMax Kellermann <mk@cm4all.com>2019-07-12 10:18:06 -0400
committerTrond Myklebust <trond.myklebust@hammerspace.com>2019-07-12 16:01:37 -0400
commitdb531db951f950b86d274cc8ed7b21b9e2240036 (patch)
treefcf499969157b18d3b6fe366490897213f508dfe
parenta101b043c44dfcb63bed7f29a675e9fa0259005e (diff)
Revert "NFS: readdirplus optimization by cache mechanism" (memleak)
This reverts commit be4c2d4723a4a637f0d1b4f7c66447141a4b3564. That commit caused a severe memory leak in nfs_readdir_make_qstr(). When listing a directory with more than 100 files (this is how many struct nfs_cache_array_entry elements fit in one 4kB page), all allocated file name strings past those 100 leak. The root of the leakage is that those string pointers are managed in pages which are never linked into the page cache. fs/nfs/dir.c puts pages into the page cache by calling read_cache_page(); the callback function nfs_readdir_filler() will then fill the given page struct which was passed to it, which is already linked in the page cache (by do_read_cache_page() calling add_to_page_cache_lru()). Commit be4c2d4723a4 added another (local) array of allocated pages, to be filled with more data, instead of discarding excess items received from the NFS server. Those additional pages can be used by the next nfs_readdir_filler() call (from within the same nfs_readdir() call). The leak happens when some of those additional pages are never used (copied to the page cache using copy_highpage()). The pages will be freed by nfs_readdir_free_pages(), but their contents will not. The commit did not invoke nfs_readdir_clear_array() (and doing so would have been dangerous, because it did not track which of those pages were already copied to the page cache, risking double free bugs). How to reproduce the leak: - Use a kernel with CONFIG_SLUB_DEBUG_ON. - Create a directory on a NFS mount with more than 100 files with names long enough to use the "kmalloc-32" slab (so we can easily look up the allocation counts): for i in `seq 110`; do touch ${i}_0123456789abcdef; done - Drop all caches: echo 3 >/proc/sys/vm/drop_caches - Check the allocation counter: grep nfs_readdir /sys/kernel/slab/kmalloc-32/alloc_calls 30564391 nfs_readdir_add_to_array+0x73/0xd0 age=534558/4791307/6540952 pid=370-1048386 cpus=0-47 nodes=0-1 - Request a directory listing and check the allocation counters again: ls [...] grep nfs_readdir /sys/kernel/slab/kmalloc-32/alloc_calls 30564511 nfs_readdir_add_to_array+0x73/0xd0 age=207/4792999/6542663 pid=370-1048386 cpus=0-47 nodes=0-1 There are now 120 new allocations. - Drop all caches and check the counters again: echo 3 >/proc/sys/vm/drop_caches grep nfs_readdir /sys/kernel/slab/kmalloc-32/alloc_calls 30564401 nfs_readdir_add_to_array+0x73/0xd0 age=735/4793524/6543176 pid=370-1048386 cpus=0-47 nodes=0-1 110 allocations are gone, but 10 have leaked and will never be freed. Unhelpfully, those allocations are explicitly excluded from KMEMLEAK, that's why my initial attempts with KMEMLEAK were not successful: /* * Avoid a kmemleak false positive. The pointer to the name is stored * in a page cache page which kmemleak does not scan. */ kmemleak_not_leak(string->name); It would be possible to solve this bug without reverting the whole commit: - keep track of which pages were not used, and call nfs_readdir_clear_array() on them, or - manually link those pages into the page cache But for now I have decided to just revert the commit, because the real fix would require complex considerations, risking more dangerous (crash) bugs, which may seem unsuitable for the stable branches. Signed-off-by: Max Kellermann <mk@cm4all.com> Cc: stable@vger.kernel.org # v5.1+ Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
-rw-r--r--fs/nfs/dir.c90
-rw-r--r--fs/nfs/internal.h3
2 files changed, 7 insertions, 86 deletions
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bd1f9555447b..8d501093660f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -144,19 +144,12 @@ struct nfs_cache_array {
144 struct nfs_cache_array_entry array[0]; 144 struct nfs_cache_array_entry array[0];
145}; 145};
146 146
147struct readdirvec {
148 unsigned long nr;
149 unsigned long index;
150 struct page *pages[NFS_MAX_READDIR_RAPAGES];
151};
152
153typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool); 147typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
154typedef struct { 148typedef struct {
155 struct file *file; 149 struct file *file;
156 struct page *page; 150 struct page *page;
157 struct dir_context *ctx; 151 struct dir_context *ctx;
158 unsigned long page_index; 152 unsigned long page_index;
159 struct readdirvec pvec;
160 u64 *dir_cookie; 153 u64 *dir_cookie;
161 u64 last_cookie; 154 u64 last_cookie;
162 loff_t current_index; 155 loff_t current_index;
@@ -536,10 +529,6 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
536 struct nfs_cache_array *array; 529 struct nfs_cache_array *array;
537 unsigned int count = 0; 530 unsigned int count = 0;
538 int status; 531 int status;
539 int max_rapages = NFS_MAX_READDIR_RAPAGES;
540
541 desc->pvec.index = desc->page_index;
542 desc->pvec.nr = 0;
543 532
544 scratch = alloc_page(GFP_KERNEL); 533 scratch = alloc_page(GFP_KERNEL);
545 if (scratch == NULL) 534 if (scratch == NULL)
@@ -564,40 +553,20 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
564 if (desc->plus) 553 if (desc->plus)
565 nfs_prime_dcache(file_dentry(desc->file), entry); 554 nfs_prime_dcache(file_dentry(desc->file), entry);
566 555
567 status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); 556 status = nfs_readdir_add_to_array(entry, page);
568 if (status == -ENOSPC) {
569 desc->pvec.nr++;
570 if (desc->pvec.nr == max_rapages)
571 break;
572 status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
573 }
574 if (status != 0) 557 if (status != 0)
575 break; 558 break;
576 } while (!entry->eof); 559 } while (!entry->eof);
577 560
578 /*
579 * page and desc->pvec.pages[0] are valid, don't need to check
580 * whether or not to be NULL.
581 */
582 copy_highpage(page, desc->pvec.pages[0]);
583
584out_nopages: 561out_nopages:
585 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { 562 if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
586 array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]); 563 array = kmap(page);
587 array->eof_index = array->size; 564 array->eof_index = array->size;
588 status = 0; 565 status = 0;
589 kunmap_atomic(array); 566 kunmap(page);
590 } 567 }
591 568
592 put_page(scratch); 569 put_page(scratch);
593
594 /*
595 * desc->pvec.nr > 0 means at least one page was completely filled,
596 * we should return -ENOSPC. Otherwise function
597 * nfs_readdir_xdr_to_array will enter infinite loop.
598 */
599 if (desc->pvec.nr > 0)
600 return -ENOSPC;
601 return status; 570 return status;
602} 571}
603 572
@@ -631,24 +600,6 @@ out_freepages:
631 return -ENOMEM; 600 return -ENOMEM;
632} 601}
633 602
634/*
635 * nfs_readdir_rapages_init initialize rapages by nfs_cache_array structure.
636 */
637static
638void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc)
639{
640 struct nfs_cache_array *array;
641 int max_rapages = NFS_MAX_READDIR_RAPAGES;
642 int index;
643
644 for (index = 0; index < max_rapages; index++) {
645 array = kmap_atomic(desc->pvec.pages[index]);
646 memset(array, 0, sizeof(struct nfs_cache_array));
647 array->eof_index = -1;
648 kunmap_atomic(array);
649 }
650}
651
652static 603static
653int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) 604int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
654{ 605{
@@ -659,12 +610,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
659 int status = -ENOMEM; 610 int status = -ENOMEM;
660 unsigned int array_size = ARRAY_SIZE(pages); 611 unsigned int array_size = ARRAY_SIZE(pages);
661 612
662 /*
663 * This means we hit readdir rdpages miss, the preallocated rdpages
664 * are useless, the preallocate rdpages should be reinitialized.
665 */
666 nfs_readdir_rapages_init(desc);
667
668 entry.prev_cookie = 0; 613 entry.prev_cookie = 0;
669 entry.cookie = desc->last_cookie; 614 entry.cookie = desc->last_cookie;
670 entry.eof = 0; 615 entry.eof = 0;
@@ -725,24 +670,9 @@ int nfs_readdir_filler(void *data, struct page* page)
725 struct inode *inode = file_inode(desc->file); 670 struct inode *inode = file_inode(desc->file);
726 int ret; 671 int ret;
727 672
728 /* 673 ret = nfs_readdir_xdr_to_array(desc, page, inode);
729 * If desc->page_index in range desc->pvec.index and 674 if (ret < 0)
730 * desc->pvec.index + desc->pvec.nr, we get readdir cache hit. 675 goto error;
731 */
732 if (desc->page_index >= desc->pvec.index &&
733 desc->page_index < (desc->pvec.index + desc->pvec.nr)) {
734 /*
735 * page and desc->pvec.pages[x] are valid, don't need to check
736 * whether or not to be NULL.
737 */
738 copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]);
739 ret = 0;
740 } else {
741 ret = nfs_readdir_xdr_to_array(desc, page, inode);
742 if (ret < 0)
743 goto error;
744 }
745
746 SetPageUptodate(page); 676 SetPageUptodate(page);
747 677
748 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { 678 if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
@@ -907,7 +837,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
907 *desc = &my_desc; 837 *desc = &my_desc;
908 struct nfs_open_dir_context *dir_ctx = file->private_data; 838 struct nfs_open_dir_context *dir_ctx = file->private_data;
909 int res = 0; 839 int res = 0;
910 int max_rapages = NFS_MAX_READDIR_RAPAGES;
911 840
912 dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", 841 dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
913 file, (long long)ctx->pos); 842 file, (long long)ctx->pos);
@@ -927,12 +856,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
927 desc->decode = NFS_PROTO(inode)->decode_dirent; 856 desc->decode = NFS_PROTO(inode)->decode_dirent;
928 desc->plus = nfs_use_readdirplus(inode, ctx); 857 desc->plus = nfs_use_readdirplus(inode, ctx);
929 858
930 res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages);
931 if (res < 0)
932 return -ENOMEM;
933
934 nfs_readdir_rapages_init(desc);
935
936 if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) 859 if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
937 res = nfs_revalidate_mapping(inode, file->f_mapping); 860 res = nfs_revalidate_mapping(inode, file->f_mapping);
938 if (res < 0) 861 if (res < 0)
@@ -968,7 +891,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
968 break; 891 break;
969 } while (!desc->eof); 892 } while (!desc->eof);
970out: 893out:
971 nfs_readdir_free_pages(desc->pvec.pages, max_rapages);
972 if (res > 0) 894 if (res > 0)
973 res = 0; 895 res = 0;
974 dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); 896 dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f359e760ed41..a2346a2f8361 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -69,8 +69,7 @@ struct nfs_clone_mount {
69 * Maximum number of pages that readdir can use for creating 69 * Maximum number of pages that readdir can use for creating
70 * a vmapped array of pages. 70 * a vmapped array of pages.
71 */ 71 */
72#define NFS_MAX_READDIR_PAGES 64 72#define NFS_MAX_READDIR_PAGES 8
73#define NFS_MAX_READDIR_RAPAGES 8
74 73
75struct nfs_client_initdata { 74struct nfs_client_initdata {
76 unsigned long init_flags; 75 unsigned long init_flags;