Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/raid5.c | 246
1 file changed, 128 insertions(+), 118 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 970eb03ec6c0..6d3a2a09cd90 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2696,13 +2696,136 @@ static int make_request(request_queue_t *q, struct bio * bi)
 	return 0;
 }
 
-/* FIXME go_faster isn't used */
-static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
 {
+	/* reshaping is quite different to recovery/resync so it is
+	 * handled quite separately ... here.
+	 *
+	 * On each call to sync_request, we gather one chunk worth of
+	 * destination stripes and flag them as expanding.
+	 * Then we find all the source stripes and request reads.
+	 * As the reads complete, handle_stripe will copy the data
+	 * into the destination stripe and release that stripe.
+	 */
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	struct stripe_head *sh;
 	int pd_idx;
 	sector_t first_sector, last_sector;
+	int raid_disks;
+	int data_disks;
+	int i;
+	int dd_idx;
+	sector_t writepos, safepos, gap;
+
+	if (sector_nr == 0 &&
+	    conf->expand_progress != 0) {
+		/* restarting in the middle, skip the initial sectors */
+		sector_nr = conf->expand_progress;
+		sector_div(sector_nr, conf->raid_disks-1);
+		*skipped = 1;
+		return sector_nr;
+	}
+
+	/* we update the metadata when there is more than 3Meg
+	 * in the block range (that is rather arbitrary, should
+	 * probably be time based) or when the data about to be
+	 * copied would over-write the source of the data at
+	 * the front of the range.
+	 * i.e. one new_stripe forward from expand_progress new_maps
+	 * to after where expand_lo old_maps to
+	 */
+	writepos = conf->expand_progress +
+		conf->chunk_size/512*(conf->raid_disks-1);
+	sector_div(writepos, conf->raid_disks-1);
+	safepos = conf->expand_lo;
+	sector_div(safepos, conf->previous_raid_disks-1);
+	gap = conf->expand_progress - conf->expand_lo;
+
+	if (writepos >= safepos ||
+	    gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
+		/* Cannot proceed until we've updated the superblock... */
+		wait_event(conf->wait_for_overlap,
+			   atomic_read(&conf->reshape_stripes)==0);
+		mddev->reshape_position = conf->expand_progress;
+		mddev->sb_dirty = 1;
+		md_wakeup_thread(mddev->thread);
+		wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
+			   kthread_should_stop());
+		spin_lock_irq(&conf->device_lock);
+		conf->expand_lo = mddev->reshape_position;
+		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_for_overlap);
+	}
+
+	for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
+		int j;
+		int skipped = 0;
+		pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
+		sh = get_active_stripe(conf, sector_nr+i,
+				       conf->raid_disks, pd_idx, 0);
+		set_bit(STRIPE_EXPANDING, &sh->state);
+		atomic_inc(&conf->reshape_stripes);
+		/* If any of this stripe is beyond the end of the old
+		 * array, then we need to zero those blocks
+		 */
+		for (j=sh->disks; j--;) {
+			sector_t s;
+			if (j == sh->pd_idx)
+				continue;
+			s = compute_blocknr(sh, j);
+			if (s < (mddev->array_size<<1)) {
+				skipped = 1;
+				continue;
+			}
+			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
+			set_bit(R5_Expanded, &sh->dev[j].flags);
+			set_bit(R5_UPTODATE, &sh->dev[j].flags);
+		}
+		if (!skipped) {
+			set_bit(STRIPE_EXPAND_READY, &sh->state);
+			set_bit(STRIPE_HANDLE, &sh->state);
+		}
+		release_stripe(sh);
+	}
+	spin_lock_irq(&conf->device_lock);
+	conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
+	spin_unlock_irq(&conf->device_lock);
+	/* Ok, those stripe are ready. We can start scheduling
+	 * reads on the source stripes.
+	 * The source stripes are determined by mapping the first and last
+	 * block on the destination stripes.
+	 */
+	raid_disks = conf->previous_raid_disks;
+	data_disks = raid_disks - 1;
+	first_sector =
+		raid5_compute_sector(sector_nr*(conf->raid_disks-1),
+				     raid_disks, data_disks,
+				     &dd_idx, &pd_idx, conf);
+	last_sector =
+		raid5_compute_sector((sector_nr+conf->chunk_size/512)
+				     *(conf->raid_disks-1) -1,
+				     raid_disks, data_disks,
+				     &dd_idx, &pd_idx, conf);
+	if (last_sector >= (mddev->size<<1))
+		last_sector = (mddev->size<<1)-1;
+	while (first_sector <= last_sector) {
+		pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
+		sh = get_active_stripe(conf, first_sector,
+				       conf->previous_raid_disks, pd_idx, 0);
+		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+		set_bit(STRIPE_HANDLE, &sh->state);
+		release_stripe(sh);
+		first_sector += STRIPE_SECTORS;
+	}
+	return conf->chunk_size>>9;
+}
+
+/* FIXME go_faster isn't used */
+static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+{
+	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+	struct stripe_head *sh;
+	int pd_idx;
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	sector_t max_sector = mddev->size << 1;
@@ -2728,122 +2851,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		return 0;
 	}
 
-	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
-		/* reshaping is quite different to recovery/resync so it is
-		 * handled quite separately ... here.
-		 *
-		 * On each call to sync_request, we gather one chunk worth of
-		 * destination stripes and flag them as expanding.
-		 * Then we find all the source stripes and request reads.
-		 * As the reads complete, handle_stripe will copy the data
-		 * into the destination stripe and release that stripe.
-		 */
-		int i;
-		int dd_idx;
-		sector_t writepos, safepos, gap;
-
-		if (sector_nr == 0 &&
-		    conf->expand_progress != 0) {
-			/* restarting in the middle, skip the initial sectors */
-			sector_nr = conf->expand_progress;
-			sector_div(sector_nr, conf->raid_disks-1);
-			*skipped = 1;
-			return sector_nr;
-		}
-
-		/* we update the metadata when there is more than 3Meg
-		 * in the block range (that is rather arbitrary, should
-		 * probably be time based) or when the data about to be
-		 * copied would over-write the source of the data at
-		 * the front of the range.
-		 * i.e. one new_stripe forward from expand_progress new_maps
-		 * to after where expand_lo old_maps to
-		 */
-		writepos = conf->expand_progress +
-			conf->chunk_size/512*(conf->raid_disks-1);
-		sector_div(writepos, conf->raid_disks-1);
-		safepos = conf->expand_lo;
-		sector_div(safepos, conf->previous_raid_disks-1);
-		gap = conf->expand_progress - conf->expand_lo;
-
-		if (writepos >= safepos ||
-		    gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) {
-			/* Cannot proceed until we've updated the superblock... */
-			wait_event(conf->wait_for_overlap,
-				   atomic_read(&conf->reshape_stripes)==0);
-			mddev->reshape_position = conf->expand_progress;
-			mddev->sb_dirty = 1;
-			md_wakeup_thread(mddev->thread);
-			wait_event(mddev->sb_wait, mddev->sb_dirty == 0 ||
-				   kthread_should_stop());
-			spin_lock_irq(&conf->device_lock);
-			conf->expand_lo = mddev->reshape_position;
-			spin_unlock_irq(&conf->device_lock);
-			wake_up(&conf->wait_for_overlap);
-		}
-
-		for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) {
-			int j;
-			int skipped = 0;
-			pd_idx = stripe_to_pdidx(sector_nr+i, conf, conf->raid_disks);
-			sh = get_active_stripe(conf, sector_nr+i,
-					       conf->raid_disks, pd_idx, 0);
-			set_bit(STRIPE_EXPANDING, &sh->state);
-			atomic_inc(&conf->reshape_stripes);
-			/* If any of this stripe is beyond the end of the old
-			 * array, then we need to zero those blocks
-			 */
-			for (j=sh->disks; j--;) {
-				sector_t s;
-				if (j == sh->pd_idx)
-					continue;
-				s = compute_blocknr(sh, j);
-				if (s < (mddev->array_size<<1)) {
-					skipped = 1;
-					continue;
-				}
-				memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
-				set_bit(R5_Expanded, &sh->dev[j].flags);
-				set_bit(R5_UPTODATE, &sh->dev[j].flags);
-			}
-			if (!skipped) {
-				set_bit(STRIPE_EXPAND_READY, &sh->state);
-				set_bit(STRIPE_HANDLE, &sh->state);
-			}
-			release_stripe(sh);
-		}
-		spin_lock_irq(&conf->device_lock);
-		conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1);
-		spin_unlock_irq(&conf->device_lock);
-		/* Ok, those stripe are ready. We can start scheduling
-		 * reads on the source stripes.
-		 * The source stripes are determined by mapping the first and last
-		 * block on the destination stripes.
-		 */
-		raid_disks = conf->previous_raid_disks;
-		data_disks = raid_disks - 1;
-		first_sector =
-			raid5_compute_sector(sector_nr*(conf->raid_disks-1),
-					     raid_disks, data_disks,
-					     &dd_idx, &pd_idx, conf);
-		last_sector =
-			raid5_compute_sector((sector_nr+conf->chunk_size/512)
-					     *(conf->raid_disks-1) -1,
-					     raid_disks, data_disks,
-					     &dd_idx, &pd_idx, conf);
-		if (last_sector >= (mddev->size<<1))
-			last_sector = (mddev->size<<1)-1;
-		while (first_sector <= last_sector) {
-			pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks);
-			sh = get_active_stripe(conf, first_sector,
-					       conf->previous_raid_disks, pd_idx, 0);
-			set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-			set_bit(STRIPE_HANDLE, &sh->state);
-			release_stripe(sh);
-			first_sector += STRIPE_SECTORS;
-		}
-		return conf->chunk_size>>9;
-	}
+	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+		return reshape_request(mddev, sector_nr, skipped);
+
 	/* if there is too many failed drives and we are trying
 	 * to resync, then assert that we are finished, because there is
 	 * nothing we can do.
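
Note on the superblock-update test in reshape_request above: writepos is where the next chunk of relocated data will land, expressed as a per-device offset in the new (larger) geometry, while safepos is the per-device offset in the old geometry corresponding to the progress last recorded on disk. What follows is a minimal userspace sketch of that arithmetic, not kernel code: the geometry and progress values are invented for illustration, and plain division stands in for the kernel's sector_div().

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	int new_data_disks = 5;           /* conf->raid_disks - 1 after growing to 6 disks */
	int old_data_disks = 4;           /* conf->previous_raid_disks - 1 */
	sector_t chunk_sectors = 128;     /* conf->chunk_size / 512, i.e. 64KiB chunks */
	sector_t expand_progress = 20480; /* array sectors already relocated (invented) */
	sector_t expand_lo = 16384;       /* progress last written to the superblock (invented) */

	/* One chunk ahead of expand_progress, as a per-device offset in the NEW layout */
	sector_t writepos = (expand_progress + chunk_sectors * new_data_disks)
			    / new_data_disks;
	/* The recorded safe point, as a per-device offset in the OLD layout */
	sector_t safepos = expand_lo / old_data_disks;
	sector_t gap = expand_progress - expand_lo;

	/* (raid_disks-1)*3000*2 sectors: roughly 3MB of data per data disk */
	if (writepos >= safepos || gap > (sector_t)new_data_disks * 3000 * 2)
		printf("update superblock first (writepos=%llu, safepos=%llu, gap=%llu)\n",
		       writepos, safepos, gap);
	else
		printf("safe to copy the next chunk\n");
	return 0;
}

With these sample numbers writepos (4224) has already reached safepos (4096), so the superblock would have to be rewritten before the next chunk is copied, even though the ~3MB batch limit (gap of 4096 vs. a threshold of 30000 sectors) has not been hit.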
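The destination-to-source window calculation relies on the same per-device/logical mapping throughout: sector_nr is a per-device offset in the new layout, multiplying by the data-disk count gives a logical array sector, and raid5_compute_sector() maps logical sectors back into per-device offsets in the old geometry. Below is a hedged sketch of just the offset half of that mapping; compute_dev_sector is a made-up helper that deliberately ignores the parity-placement and dd_idx/pd_idx logic the real raid5_compute_sector() also performs, and all values are illustrative only.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Hypothetical helper: logical array sector -> per-device sector for a
 * layout with the given number of data disks (parity placement omitted). */
static sector_t compute_dev_sector(sector_t logical, int data_disks,
				   sector_t chunk_sectors)
{
	sector_t chunk_number = logical / chunk_sectors;
	sector_t chunk_offset = logical % chunk_sectors;
	sector_t stripe = chunk_number / data_disks;

	return stripe * chunk_sectors + chunk_offset;
}

int main(void)
{
	sector_t chunk_sectors = 128;    /* 64KiB chunks */
	int new_data_disks = 5;          /* conf->raid_disks - 1 */
	int old_data_disks = 4;          /* conf->previous_raid_disks - 1 */
	sector_t sector_nr = 4096;       /* per-device reshape position, new layout */

	/* The chunk of destination stripes gathered above covers this range
	 * of logical array sectors... */
	sector_t logical_first = sector_nr * new_data_disks;
	sector_t logical_last = (sector_nr + chunk_sectors) * new_data_disks - 1;

	/* ...which the old geometry spreads over this per-device window, the
	 * range the source-read loop walks in STRIPE_SECTORS steps. */
	printf("source window: %llu .. %llu\n",
	       compute_dev_sector(logical_first, old_data_disks, chunk_sectors),
	       compute_dev_sector(logical_last, old_data_disks, chunk_sectors));
	return 0;
}

For these inputs the window is 5120..5375: one new-layout chunk's worth of data (640 logical sectors) spans parts of two chunks per device in the old four-data-disk geometry, which is why the loop clamps last_sector to the array end and reads stripe by stripe.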