author     Dave Kleikamp <shaggy@austin.ibm.com>   2006-01-24 15:34:47 -0500
committer  Dave Kleikamp <shaggy@austin.ibm.com>   2006-01-24 15:34:47 -0500
commit     0a0fc0ddbe732779366ab6b1b879f62195e65967 (patch)
tree       7b42490a676cf39ae0691b6859ecf7fd410f229b /mm/hugetlb.c
parent     4d5dbd0945d9e0833dd7964a3d6ee33157f7cc7a (diff)
parent     3ee68c4af3fd7228c1be63254b9f884614f9ebb2 (diff)
Merge with /home/shaggy/git/linus-clean/
Diffstat (limited to 'mm/hugetlb.c')
 -rw-r--r--  mm/hugetlb.c | 200
 1 file changed, 147 insertions(+), 53 deletions(-)
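The hugetlb.c changes pulled in by this merge add copy-on-write support for private huge page mappings (hugetlb_cow(), plus the wrprotect at fork time in copy_hugetlb_page_range()) and make huge page allocation NUMA/cpuset aware (dequeue_huge_page() now walks a zonelist). The user-space sketch below is an illustration only, not part of the patch: it shows the private-mapping fork/write case the new COW path handles. The hugetlbfs mount point /mnt/huge and the 2 MB huge page size are assumptions; at least two free huge pages are needed.

/*
 * Illustration only (not part of this commit): exercise the private
 * hugetlb COW path added by this merge.  Assumes a hugetlbfs mount at
 * /mnt/huge (hypothetical) and at least two free 2 MB huge pages.
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/wait.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumes 2 MB huge pages */

int main(void)
{
	int fd = open("/mnt/huge/cow-demo", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* MAP_PRIVATE: the child's write below must not be seen here. */
	char *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	p[0] = 'a';	/* first touch faults the huge page in */

	if (fork() == 0) {
		/*
		 * copy_hugetlb_page_range() write-protected the PTE at
		 * fork(); this write faults and hugetlb_cow() gives the
		 * child its own copy of the huge page.
		 */
		p[0] = 'b';
		_exit(0);
	}
	wait(NULL);

	printf("parent sees '%c' (expected 'a')\n", p[0]);
	return 0;
}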
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 728e9bda12ea..b21d78c941b5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,9 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -22,6 +25,10 @@ unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
 static void enqueue_huge_page(struct page *page)
@@ -32,18 +39,22 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+				unsigned long address)
 {
 	int nid = numa_node_id();
 	struct page *page = NULL;
+	struct zonelist *zonelist = huge_zonelist(vma, address);
+	struct zone **z;
 
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
+	for (z = zonelist->zones; *z; z++) {
+		nid = (*z)->zone_pgdat->node_id;
+		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+		    !list_empty(&hugepage_freelists[nid]))
+			break;
 	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
+
+	if (*z) {
 		page = list_entry(hugepage_freelists[nid].next,
 				  struct page, lru);
 		list_del(&page->lru);
@@ -61,8 +72,10 @@ static struct page *alloc_fresh_huge_page(void)
 					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % num_online_nodes();
 	if (page) {
+		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
+		spin_unlock(&hugetlb_lock);
 	}
 	return page;
 }
@@ -79,13 +92,13 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
 	int i;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page();
+	page = dequeue_huge_page(vma, addr);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
 		return NULL;
@@ -188,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	spin_lock(&hugetlb_lock);
 	try_to_free_low(count);
 	while (count < nr_huge_pages) {
-		struct page *page = dequeue_huge_page();
+		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
@@ -255,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
 	.nopage = hugetlb_nopage,
 };
 
-static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
+				int writable)
 {
 	pte_t entry;
 
-	if (vma->vm_flags & VM_WRITE) {
+	if (writable) {
 		entry =
 		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 	} else {
@@ -271,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
 	return entry;
 }
 
+static void set_huge_ptep_writable(struct vm_area_struct *vma,
+				unsigned long address, pte_t *ptep)
+{
+	pte_t entry;
+
+	entry = pte_mkwrite(pte_mkdirty(*ptep));
+	ptep_set_access_flags(vma, address, ptep, entry, 1);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+}
+
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			    struct vm_area_struct *vma)
 {
 	pte_t *src_pte, *dst_pte, entry;
 	struct page *ptepage;
 	unsigned long addr;
+	int cow;
+
+	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		src_pte = huge_pte_offset(src, addr);
@@ -288,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
 		if (!pte_none(*src_pte)) {
+			if (cow)
+				ptep_set_wrprotect(src, addr, src_pte);
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
@@ -339,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_lock_huge_page(struct address_space *mapping,
-					unsigned long idx)
+static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, pte_t pte)
 {
-	struct page *page;
-	int err;
-	struct inode *inode = mapping->host;
-	unsigned long size;
+	struct page *old_page, *new_page;
+	int i, avoidcopy;
 
-retry:
-	page = find_lock_page(mapping, idx);
-	if (page)
-		goto out;
+	old_page = pte_page(pte);
 
-	/* Check to make sure the mapping hasn't been truncated */
-	size = i_size_read(inode) >> HPAGE_SHIFT;
-	if (idx >= size)
-		goto out;
+	/* If no-one else is actually using this page, avoid the copy
+	 * and just make the page writable */
+	avoidcopy = (page_count(old_page) == 1);
+	if (avoidcopy) {
+		set_huge_ptep_writable(vma, address, ptep);
+		return VM_FAULT_MINOR;
+	}
 
-	if (hugetlb_get_quota(mapping))
-		goto out;
-	page = alloc_huge_page();
-	if (!page) {
-		hugetlb_put_quota(mapping);
-		goto out;
+	page_cache_get(old_page);
+	new_page = alloc_huge_page(vma, address);
+
+	if (!new_page) {
+		page_cache_release(old_page);
+
+		/* Logically this is OOM, not a SIGBUS, but an OOM
+		 * could cause the kernel to go killing other
+		 * processes which won't help the hugepage situation
+		 * at all (?) */
+		return VM_FAULT_SIGBUS;
 	}
 
-	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
-	if (err) {
-		put_page(page);
-		hugetlb_put_quota(mapping);
-		if (err == -EEXIST)
-			goto retry;
-		page = NULL;
+	spin_unlock(&mm->page_table_lock);
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+		copy_user_highpage(new_page + i, old_page + i,
+				   address + i*PAGE_SIZE);
+	spin_lock(&mm->page_table_lock);
+
+	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	if (likely(pte_same(*ptep, pte))) {
+		/* Break COW */
+		set_huge_pte_at(mm, address, ptep,
+				make_huge_pte(vma, new_page, 1));
+		/* Make the old page be freed below */
+		new_page = old_page;
 	}
-out:
-	return page;
+	page_cache_release(new_page);
+	page_cache_release(old_page);
+	return VM_FAULT_MINOR;
 }
 
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, int write_access)
+int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *ptep, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
 	unsigned long idx;
 	unsigned long size;
-	pte_t *pte;
 	struct page *page;
 	struct address_space *mapping;
-
-	pte = huge_pte_alloc(mm, address);
-	if (!pte)
-		goto out;
+	pte_t new_pte;
 
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -399,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_lock_huge_page(mapping, idx);
-	if (!page)
-		goto out;
+retry:
+	page = find_lock_page(mapping, idx);
+	if (!page) {
+		if (hugetlb_get_quota(mapping))
+			goto out;
+		page = alloc_huge_page(vma, address);
+		if (!page) {
+			hugetlb_put_quota(mapping);
+			goto out;
+		}
+
+		if (vma->vm_flags & VM_SHARED) {
+			int err;
+
+			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+			if (err) {
+				put_page(page);
+				hugetlb_put_quota(mapping);
+				if (err == -EEXIST)
+					goto retry;
+				goto out;
+			}
+		} else
+			lock_page(page);
+	}
 
 	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -409,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto backout;
 
 	ret = VM_FAULT_MINOR;
-	if (!pte_none(*pte))
+	if (!pte_none(*ptep))
 		goto backout;
 
 	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
+	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+				&& (vma->vm_flags & VM_SHARED)));
+	set_huge_pte_at(mm, address, ptep, new_pte);
+
+	if (write_access && !(vma->vm_flags & VM_SHARED)) {
+		/* Optimization, do the COW without a second fault */
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+	}
+
 	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
@@ -427,6 +494,33 @@ backout:
 	goto out;
 }
 
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access)
+{
+	pte_t *ptep;
+	pte_t entry;
+	int ret;
+
+	ptep = huge_pte_alloc(mm, address);
+	if (!ptep)
+		return VM_FAULT_OOM;
+
+	entry = *ptep;
+	if (pte_none(entry))
+		return hugetlb_no_page(mm, vma, address, ptep, write_access);
+
+	ret = VM_FAULT_MINOR;
+
+	spin_lock(&mm->page_table_lock);
+	/* Check for a racing update before calling hugetlb_cow */
+	if (likely(pte_same(entry, *ptep)))
+		if (write_access && !pte_write(entry))
+			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+	spin_unlock(&mm->page_table_lock);
+
+	return ret;
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i)