diff options
author | Debarshi Dutta <ddutta@nvidia.com> | 2019-04-30 05:41:31 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2019-05-09 17:42:33 -0400 |
commit | 6509bb49da19ba9b19e3df64e473b01d54fd310d (patch) | |
tree | b34d19c88fc122f369b1f22094d9a5e22c67df92 /drivers/gpu/nvgpu/gk20a | |
parent | 4d8ad643d67ac4044f76976c4085a35fcc5d4095 (diff) |
gpu: nvgpu: protect recovery with engines_reset_mutex
Rename gr_reset_mutex to engines_reset_mutex and acquire it
before initiating recovery. Recovery running in parallel with
engine reset is not recommended.
On hitting engine reset, h/w drops the ctxsw_status to INVALID in the
fifo_engine_status register. Also, while the engine is held in reset,
h/w passes busy/idle straight through. The fifo_engine_status registers
are correct in that there is no context switch outstanding,
as the CTXSW is aborted when reset is asserted.
Use deferred_reset_mutex to protect the deferred_reset_pending variable.
If deferred_reset_pending is true, then acquire engines_reset_mutex
and call gk20a_fifo_deferred_reset.
gk20a_fifo_deferred_reset also checks the value of
deferred_reset_pending before initiating the reset process.
Bug 2092051
Bug 2429295
Bug 2484211
Bug 1890287
Change-Id: I47de669a6203e0b2e9a8237ec4e4747339b9837c
Signed-off-by: Seema Khowala <seemaj@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2022373
Signed-off-by: Debarshi Dutta <ddutta@nvidia.com>
(cherry-picked from cb91bf1e13740023903282d1c2271d9154e940ba
in dev-main)
Reviewed-on: https://git-master.nvidia.com/r/2024901
GVS: Gerrit_Virtual_Submit
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 68 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | 2 |
2 files changed, 55 insertions, 15 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index b96372b4..5aca7d62 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |||
@@ -910,9 +910,9 @@ int gk20a_init_fifo_setup_sw_common(struct gk20a *g) | |||
910 | return err; | 910 | return err; |
911 | } | 911 | } |
912 | 912 | ||
913 | err = nvgpu_mutex_init(&f->gr_reset_mutex); | 913 | err = nvgpu_mutex_init(&f->engines_reset_mutex); |
914 | if (err) { | 914 | if (err) { |
915 | nvgpu_err(g, "failed to init gr_reset_mutex"); | 915 | nvgpu_err(g, "failed to init engines_reset_mutex"); |
916 | return err; | 916 | return err; |
917 | } | 917 | } |
918 | 918 | ||
@@ -1581,14 +1581,22 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch) | |||
1581 | { | 1581 | { |
1582 | unsigned long engine_id, engines = 0U; | 1582 | unsigned long engine_id, engines = 0U; |
1583 | struct tsg_gk20a *tsg; | 1583 | struct tsg_gk20a *tsg; |
1584 | bool deferred_reset_pending; | ||
1585 | struct fifo_gk20a *f = &g->fifo; | ||
1584 | 1586 | ||
1585 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); | 1587 | nvgpu_mutex_acquire(&g->dbg_sessions_lock); |
1586 | gr_gk20a_disable_ctxsw(g); | ||
1587 | 1588 | ||
1588 | if (!g->fifo.deferred_reset_pending) { | 1589 | nvgpu_mutex_acquire(&f->deferred_reset_mutex); |
1589 | goto clean_up; | 1590 | deferred_reset_pending = g->fifo.deferred_reset_pending; |
1591 | nvgpu_mutex_release(&f->deferred_reset_mutex); | ||
1592 | |||
1593 | if (!deferred_reset_pending) { | ||
1594 | nvgpu_mutex_release(&g->dbg_sessions_lock); | ||
1595 | return 0; | ||
1590 | } | 1596 | } |
1591 | 1597 | ||
1598 | gr_gk20a_disable_ctxsw(g); | ||
1599 | |||
1592 | tsg = tsg_gk20a_from_ch(ch); | 1600 | tsg = tsg_gk20a_from_ch(ch); |
1593 | if (tsg != NULL) { | 1601 | if (tsg != NULL) { |
1594 | engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true); | 1602 | engines = gk20a_fifo_engines_on_id(g, tsg->tsgid, true); |
@@ -1610,8 +1618,10 @@ int gk20a_fifo_deferred_reset(struct gk20a *g, struct channel_gk20a *ch) | |||
1610 | } | 1618 | } |
1611 | } | 1619 | } |
1612 | 1620 | ||
1621 | nvgpu_mutex_acquire(&f->deferred_reset_mutex); | ||
1613 | g->fifo.deferred_fault_engines = 0; | 1622 | g->fifo.deferred_fault_engines = 0; |
1614 | g->fifo.deferred_reset_pending = false; | 1623 | g->fifo.deferred_reset_pending = false; |
1624 | nvgpu_mutex_release(&f->deferred_reset_mutex); | ||
1615 | 1625 | ||
1616 | clean_up: | 1626 | clean_up: |
1617 | gr_gk20a_enable_ctxsw(g); | 1627 | gr_gk20a_enable_ctxsw(g); |
@@ -1632,9 +1642,10 @@ static bool gk20a_fifo_handle_mmu_fault_locked( | |||
1632 | bool verbose = true; | 1642 | bool verbose = true; |
1633 | u32 grfifo_ctl; | 1643 | u32 grfifo_ctl; |
1634 | 1644 | ||
1635 | nvgpu_log_fn(g, " "); | 1645 | bool deferred_reset_pending = false; |
1646 | struct fifo_gk20a *f = &g->fifo; | ||
1636 | 1647 | ||
1637 | g->fifo.deferred_reset_pending = false; | 1648 | nvgpu_log_fn(g, " "); |
1638 | 1649 | ||
1639 | /* Disable power management */ | 1650 | /* Disable power management */ |
1640 | if (g->support_pmu) { | 1651 | if (g->support_pmu) { |
@@ -1661,6 +1672,9 @@ static bool gk20a_fifo_handle_mmu_fault_locked( | |||
1661 | gk20a_debug_dump(g); | 1672 | gk20a_debug_dump(g); |
1662 | } | 1673 | } |
1663 | 1674 | ||
1675 | nvgpu_mutex_acquire(&f->deferred_reset_mutex); | ||
1676 | g->fifo.deferred_reset_pending = false; | ||
1677 | nvgpu_mutex_release(&f->deferred_reset_mutex); | ||
1664 | 1678 | ||
1665 | /* go through all faulted engines */ | 1679 | /* go through all faulted engines */ |
1666 | for_each_set_bit(engine_mmu_fault_id, &fault_id, 32) { | 1680 | for_each_set_bit(engine_mmu_fault_id, &fault_id, 32) { |
@@ -1761,17 +1775,17 @@ static bool gk20a_fifo_handle_mmu_fault_locked( | |||
1761 | g->fifo.deferred_fault_engines |= BIT(engine_id); | 1775 | g->fifo.deferred_fault_engines |= BIT(engine_id); |
1762 | 1776 | ||
1763 | /* handled during channel free */ | 1777 | /* handled during channel free */ |
1778 | nvgpu_mutex_acquire(&f->deferred_reset_mutex); | ||
1764 | g->fifo.deferred_reset_pending = true; | 1779 | g->fifo.deferred_reset_pending = true; |
1780 | nvgpu_mutex_release(&f->deferred_reset_mutex); | ||
1781 | |||
1782 | deferred_reset_pending = true; | ||
1783 | |||
1765 | nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, | 1784 | nvgpu_log(g, gpu_dbg_intr | gpu_dbg_gpu_dbg, |
1766 | "sm debugger attached," | 1785 | "sm debugger attached," |
1767 | " deferring channel recovery to channel free"); | 1786 | " deferring channel recovery to channel free"); |
1768 | } else { | 1787 | } else { |
1769 | /* if lock is already taken, a reset is taking place | 1788 | gk20a_fifo_reset_engine(g, engine_id); |
1770 | so no need to repeat */ | ||
1771 | if (nvgpu_mutex_tryacquire(&g->fifo.gr_reset_mutex)) { | ||
1772 | gk20a_fifo_reset_engine(g, engine_id); | ||
1773 | nvgpu_mutex_release(&g->fifo.gr_reset_mutex); | ||
1774 | } | ||
1775 | } | 1789 | } |
1776 | } | 1790 | } |
1777 | 1791 | ||
@@ -1784,7 +1798,7 @@ static bool gk20a_fifo_handle_mmu_fault_locked( | |||
1784 | * Disable the channel/TSG from hw and increment syncpoints. | 1798 | * Disable the channel/TSG from hw and increment syncpoints. |
1785 | */ | 1799 | */ |
1786 | if (tsg) { | 1800 | if (tsg) { |
1787 | if (g->fifo.deferred_reset_pending) { | 1801 | if (deferred_reset_pending) { |
1788 | gk20a_disable_tsg(tsg); | 1802 | gk20a_disable_tsg(tsg); |
1789 | } else { | 1803 | } else { |
1790 | if (!fake_fault) { | 1804 | if (!fake_fault) { |
@@ -1847,6 +1861,9 @@ static bool gk20a_fifo_handle_mmu_fault( | |||
1847 | 1861 | ||
1848 | nvgpu_log_fn(g, " "); | 1862 | nvgpu_log_fn(g, " "); |
1849 | 1863 | ||
1864 | nvgpu_log_info(g, "acquire engines_reset_mutex"); | ||
1865 | nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex); | ||
1866 | |||
1850 | nvgpu_log_info(g, "acquire runlist_lock for all runlists"); | 1867 | nvgpu_log_info(g, "acquire runlist_lock for all runlists"); |
1851 | for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) { | 1868 | for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) { |
1852 | nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock); | 1869 | nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock); |
@@ -1859,6 +1876,10 @@ static bool gk20a_fifo_handle_mmu_fault( | |||
1859 | for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) { | 1876 | for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) { |
1860 | nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock); | 1877 | nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock); |
1861 | } | 1878 | } |
1879 | |||
1880 | nvgpu_log_info(g, "release engines_reset_mutex"); | ||
1881 | nvgpu_mutex_release(&g->fifo.engines_reset_mutex); | ||
1882 | |||
1862 | return verbose; | 1883 | return verbose; |
1863 | } | 1884 | } |
1864 | 1885 | ||
@@ -1954,6 +1975,16 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg, | |||
1954 | g->ops.fifo.disable_tsg(tsg); | 1975 | g->ops.fifo.disable_tsg(tsg); |
1955 | 1976 | ||
1956 | /* | 1977 | /* |
1978 | * On hitting engine reset, h/w drops the ctxsw_status to INVALID in | ||
1979 | * fifo_engine_status register. Also while the engine is held in reset | ||
1980 | * h/w passes busy/idle straight through. fifo_engine_status registers | ||
1981 | * are correct in that there is no context switch outstanding | ||
1982 | * as the CTXSW is aborted when reset is asserted. | ||
1983 | */ | ||
1984 | nvgpu_log_info(g, "acquire engines_reset_mutex"); | ||
1985 | nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex); | ||
1986 | |||
1987 | /* | ||
1957 | * stop context switching to prevent engine assignments from | 1988 | * stop context switching to prevent engine assignments from |
1958 | * changing until engine status is checked to make sure tsg | 1989 | * changing until engine status is checked to make sure tsg |
1959 | * being recovered is not loaded on the engines | 1990 | * being recovered is not loaded on the engines |
@@ -1980,6 +2011,9 @@ void gk20a_fifo_recover_tsg(struct gk20a *g, struct tsg_gk20a *tsg, | |||
1980 | } | 2011 | } |
1981 | } | 2012 | } |
1982 | 2013 | ||
2014 | nvgpu_log_info(g, "release engines_reset_mutex"); | ||
2015 | nvgpu_mutex_release(&g->fifo.engines_reset_mutex); | ||
2016 | |||
1983 | if (engines) { | 2017 | if (engines) { |
1984 | gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose, | 2018 | gk20a_fifo_recover(g, engines, tsg->tsgid, true, true, verbose, |
1985 | rc_type); | 2019 | rc_type); |
@@ -2030,6 +2064,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids, | |||
2030 | bool id_is_tsg = (id_type == ID_TYPE_TSG) ? true : false; | 2064 | bool id_is_tsg = (id_type == ID_TYPE_TSG) ? true : false; |
2031 | u32 rlid; | 2065 | u32 rlid; |
2032 | 2066 | ||
2067 | nvgpu_log_info(g, "acquire engines_reset_mutex"); | ||
2068 | nvgpu_mutex_acquire(&g->fifo.engines_reset_mutex); | ||
2069 | |||
2033 | nvgpu_log_info(g, "acquire runlist_lock for all runlists"); | 2070 | nvgpu_log_info(g, "acquire runlist_lock for all runlists"); |
2034 | for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) { | 2071 | for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) { |
2035 | nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock); | 2072 | nvgpu_mutex_acquire(&g->fifo.runlist_info[rlid].runlist_lock); |
@@ -2094,6 +2131,9 @@ void gk20a_fifo_teardown_ch_tsg(struct gk20a *g, u32 __engine_ids, | |||
2094 | for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) { | 2131 | for (rlid = 0; rlid < g->fifo.max_runlists; rlid++) { |
2095 | nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock); | 2132 | nvgpu_mutex_release(&g->fifo.runlist_info[rlid].runlist_lock); |
2096 | } | 2133 | } |
2134 | |||
2135 | nvgpu_log_info(g, "release engines_reset_mutex"); | ||
2136 | nvgpu_mutex_release(&g->fifo.engines_reset_mutex); | ||
2097 | } | 2137 | } |
2098 | 2138 | ||
2099 | void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, | 2139 | void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids, |
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h index 0c9d9101..26365cae 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h | |||
@@ -184,7 +184,7 @@ struct fifo_gk20a { | |||
184 | /* zero-kref'd channels here */ | 184 | /* zero-kref'd channels here */ |
185 | struct nvgpu_list_node free_chs; | 185 | struct nvgpu_list_node free_chs; |
186 | struct nvgpu_mutex free_chs_mutex; | 186 | struct nvgpu_mutex free_chs_mutex; |
187 | struct nvgpu_mutex gr_reset_mutex; | 187 | struct nvgpu_mutex engines_reset_mutex; |
188 | 188 | ||
189 | struct tsg_gk20a *tsg; | 189 | struct tsg_gk20a *tsg; |
190 | struct nvgpu_mutex tsg_inuse_mutex; | 190 | struct nvgpu_mutex tsg_inuse_mutex; |