summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYang Shi <yang.shi@linux.alibaba.com>2019-06-13 18:56:05 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-06-13 23:34:56 -0400
commit7a30df49f63ad92318ddf1f7498d1129a77dd4bd (patch)
treebbb0655d43339b0b93aa53176fc493b8089e01a6
parentbe99ca2716972a712cde46092c54dee5e6192bf8 (diff)
mm: mmu_gather: remove __tlb_reset_range() for force flush
A few new fields were added to mmu_gather to make TLB flush smarter for huge pages by telling what level of page table is changed. __tlb_reset_range() is used to reset all of this page table state to unchanged, which is called by TLB flush for parallel mapping changes for the same range under non-exclusive lock (i.e. read mmap_sem). Before commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap"), the syscalls (e.g. MADV_DONTNEED, MADV_FREE) which may update PTEs in parallel don't remove page tables. But the aforementioned commit may do munmap() under read mmap_sem and free page tables. This may result in a program hang on aarch64, as reported by Jan Stancek. The problem can be reproduced by a slightly modified version of his test program, shown below. ---8<--- static int map_size = 4096; static int num_iter = 500; static long threads_total; static void *distant_area; void *map_write_unmap(void *ptr) { int *fd = ptr; unsigned char *map_address; int i, j = 0; for (i = 0; i < num_iter; i++) { map_address = mmap(distant_area, (size_t) map_size, PROT_WRITE | PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (map_address == MAP_FAILED) { perror("mmap"); exit(1); } for (j = 0; j < map_size; j++) map_address[j] = 'b'; if (munmap(map_address, map_size) == -1) { perror("munmap"); exit(1); } } return NULL; } void *dummy(void *ptr) { return NULL; } int main(void) { pthread_t thid[2]; /* hint for mmap in map_write_unmap() */ distant_area = mmap(0, DISTANT_MMAP_SIZE, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); munmap(distant_area, (size_t)DISTANT_MMAP_SIZE); distant_area += DISTANT_MMAP_SIZE / 2; while (1) { pthread_create(&thid[0], NULL, map_write_unmap, NULL); pthread_create(&thid[1], NULL, dummy, NULL); pthread_join(thid[0], NULL); pthread_join(thid[1], NULL); } } ---8<--- The program may bring in parallel execution like below: t1 t2 munmap(map_address) downgrade_write(&mm->mmap_sem); unmap_region() tlb_gather_mmu() inc_tlb_flush_pending(tlb->mm); free_pgtables() 
tlb->freed_tables = 1 tlb->cleared_pmds = 1 pthread_exit() madvise(thread_stack, 8M, MADV_DONTNEED) zap_page_range() tlb_gather_mmu() inc_tlb_flush_pending(tlb->mm); tlb_finish_mmu() if (mm_tlb_flush_nested(tlb->mm)) __tlb_reset_range() __tlb_reset_range() would reset freed_tables and cleared_* bits, but this may cause inconsistency for munmap(), which does free page tables. As a result, some architectures, e.g. aarch64, may not flush the TLB completely as expected, leaving stale TLB entries behind. Use fullmm flush since it yields much better performance on aarch64 and non-fullmm doesn't yield a significant difference on x86. The originally proposed fix came from Jan Stancek, who mainly debugged this issue; I just wrapped everything up together. Jan's testing results: v5.2-rc2-24-gbec7550cca10 -------------------------- mean stddev real 37.382 2.780 user 1.420 0.078 sys 54.658 1.855 v5.2-rc2-24-gbec7550cca10 + "mm: mmu_gather: remove __tlb_reset_range() for force flush" ---------------------------------------------------------------------------------------_ mean stddev real 37.119 2.105 user 1.548 0.087 sys 55.698 1.357 [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1558322252-113575-1-git-send-email-yang.shi@linux.alibaba.com Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com> Signed-off-by: Jan Stancek <jstancek@redhat.com> Reported-by: Jan Stancek <jstancek@redhat.com> Tested-by: Jan Stancek <jstancek@redhat.com> Suggested-by: Will Deacon <will.deacon@arm.com> Tested-by: Will Deacon <will.deacon@arm.com> Acked-by: Will Deacon <will.deacon@arm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Nick Piggin <npiggin@gmail.com> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> Cc: Nadav Amit <namit@vmware.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Mel Gorman <mgorman@suse.de> Cc: <stable@vger.kernel.org> [4.20+] Signed-off-by: Andrew Morton 
<akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/mmu_gather.c24
1 files changed, 19 insertions, 5 deletions
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 99740e1dd273..8c943a6e1696 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -245,14 +245,28 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
245{ 245{
246 /* 246 /*
247 * If there are parallel threads are doing PTE changes on same range 247 * If there are parallel threads are doing PTE changes on same range
248 * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB 248 * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB
249 * flush by batching, a thread has stable TLB entry can fail to flush 249 * flush by batching, one thread may end up seeing inconsistent PTEs
250 * the TLB by observing pte_none|!pte_dirty, for example so flush TLB 250 * and result in having stale TLB entries. So flush TLB forcefully
251 * forcefully if we detect parallel PTE batching threads. 251 * if we detect parallel PTE batching threads.
252 *
253 * However, some syscalls, e.g. munmap(), may free page tables, this
254 * needs force flush everything in the given range. Otherwise this
255 * may result in having stale TLB entries for some architectures,
256 * e.g. aarch64, that could specify flush what level TLB.
252 */ 257 */
253 if (mm_tlb_flush_nested(tlb->mm)) { 258 if (mm_tlb_flush_nested(tlb->mm)) {
259 /*
260 * The aarch64 yields better performance with fullmm by
261 * avoiding multiple CPUs spamming TLBI messages at the
262 * same time.
263 *
264 * On x86 non-fullmm doesn't yield significant difference
265 * against fullmm.
266 */
267 tlb->fullmm = 1;
254 __tlb_reset_range(tlb); 268 __tlb_reset_range(tlb);
255 __tlb_adjust_range(tlb, start, end - start); 269 tlb->freed_tables = 1;
256 } 270 }
257 271
258 tlb_flush_mmu(tlb); 272 tlb_flush_mmu(tlb);