diff options
author | Thomas Fleury <tfleury@nvidia.com> | 2016-08-17 20:26:30 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2016-08-29 19:14:29 -0400 |
commit | 5286fd525731d19dfa07d5e6e49e8d0eef233531 (patch) | |
tree | 6c64a7976d8e8795fb6bfda75cdfc7b058a4ecf8 /drivers/gpu/nvgpu | |
parent | 06780e0681b34ec570346fe5d4bdaf7a23f08a36 (diff) |
gpu: nvgpu: fix ctxsw timeout handling for TSGs
While collecting failing engine data, id type (is_tsg) was not
set for ctxsw and save engine states. This could result in some
ctxsw timeout interrupts being ignored (id reported with wrong
is_tsg).
For TSGs, check if we made some progress on any of the channels
before kicking fifo recovery.
Bug 200228310
Jira EVLR-597
Change-Id: I231549ae68317919532de0f87effb78ee9c119c6
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1204035
(cherry picked from commit 7221d256fd7e9b418f7789b3d81eede8faa16f0b)
Reviewed-on: http://git-master/r/1204037
Reviewed-by: Richard Zhao <rizhao@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.c | 5 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/channel_gk20a.h | 2 | ||||
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 156 |
3 files changed, 127 insertions, 36 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c index 41fced99..d4cf6915 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c | |||
@@ -1539,16 +1539,19 @@ static inline u32 gp_free_count(struct channel_gk20a *c) | |||
1539 | } | 1539 | } |
1540 | 1540 | ||
1541 | bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, | 1541 | bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, |
1542 | u32 timeout_delta_ms) | 1542 | u32 timeout_delta_ms, bool *progress) |
1543 | { | 1543 | { |
1544 | u32 gpfifo_get = update_gp_get(ch->g, ch); | 1544 | u32 gpfifo_get = update_gp_get(ch->g, ch); |
1545 | |||
1545 | /* Count consequent timeout isr */ | 1546 | /* Count consequent timeout isr */ |
1546 | if (gpfifo_get == ch->timeout_gpfifo_get) { | 1547 | if (gpfifo_get == ch->timeout_gpfifo_get) { |
1547 | /* we didn't advance since previous channel timeout check */ | 1548 | /* we didn't advance since previous channel timeout check */ |
1548 | ch->timeout_accumulated_ms += timeout_delta_ms; | 1549 | ch->timeout_accumulated_ms += timeout_delta_ms; |
1550 | *progress = false; | ||
1549 | } else { | 1551 | } else { |
1550 | /* first timeout isr encountered */ | 1552 | /* first timeout isr encountered */ |
1551 | ch->timeout_accumulated_ms = timeout_delta_ms; | 1553 | ch->timeout_accumulated_ms = timeout_delta_ms; |
1554 | *progress = true; | ||
1552 | } | 1555 | } |
1553 | 1556 | ||
1554 | ch->timeout_gpfifo_get = gpfifo_get; | 1557 | ch->timeout_gpfifo_get = gpfifo_get; |
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h index 971175f2..6469603b 100644 --- a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h +++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h | |||
@@ -218,7 +218,7 @@ int gk20a_init_channel_support(struct gk20a *, u32 chid); | |||
218 | void gk20a_channel_close(struct channel_gk20a *ch); | 218 | void gk20a_channel_close(struct channel_gk20a *ch); |
219 | 219 | ||
220 | bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, | 220 | bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch, |
221 | u32 timeout_delta_ms); | 221 | u32 timeout_delta_ms, bool *progress); |
222 | void gk20a_disable_channel(struct channel_gk20a *ch); | 222 | void gk20a_disable_channel(struct channel_gk20a *ch); |
223 | void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt); | 223 | void gk20a_channel_abort(struct channel_gk20a *ch, bool channel_preempt); |
224 | void gk20a_channel_abort_clean_up(struct channel_gk20a *ch); | 224 | void gk20a_channel_abort_clean_up(struct channel_gk20a *ch); |
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index bd31656f..c18c7c94 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |||
@@ -1814,17 +1814,24 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, | |||
1814 | if (ctx_status == | 1814 | if (ctx_status == |
1815 | fifo_engine_status_ctx_status_ctxsw_load_v()) { | 1815 | fifo_engine_status_ctx_status_ctxsw_load_v()) { |
1816 | id = fifo_engine_status_next_id_v(status); | 1816 | id = fifo_engine_status_next_id_v(status); |
1817 | is_tsg = fifo_pbdma_status_id_type_v(status) | 1817 | is_tsg = fifo_engine_status_next_id_type_v(status) != |
1818 | != fifo_pbdma_status_id_type_chid_v(); | 1818 | fifo_engine_status_next_id_type_chid_v(); |
1819 | } else if (ctx_status == | 1819 | } else if (ctx_status == |
1820 | fifo_engine_status_ctx_status_ctxsw_switch_v()) { | 1820 | fifo_engine_status_ctx_status_ctxsw_switch_v()) { |
1821 | mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2)); | 1821 | mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2)); |
1822 | if (mailbox2 & FECS_METHOD_WFI_RESTORE) | 1822 | if (mailbox2 & FECS_METHOD_WFI_RESTORE) { |
1823 | id = fifo_engine_status_next_id_v(status); | 1823 | id = fifo_engine_status_next_id_v(status); |
1824 | else | 1824 | is_tsg = fifo_engine_status_next_id_type_v(status) != |
1825 | fifo_engine_status_next_id_type_chid_v(); | ||
1826 | } else { | ||
1825 | id = fifo_engine_status_id_v(status); | 1827 | id = fifo_engine_status_id_v(status); |
1828 | is_tsg = fifo_engine_status_id_type_v(status) != | ||
1829 | fifo_engine_status_id_type_chid_v(); | ||
1830 | } | ||
1826 | } else { | 1831 | } else { |
1827 | id = fifo_engine_status_id_v(status); | 1832 | id = fifo_engine_status_id_v(status); |
1833 | is_tsg = fifo_engine_status_id_type_v(status) != | ||
1834 | fifo_engine_status_id_type_chid_v(); | ||
1828 | } | 1835 | } |
1829 | break; | 1836 | break; |
1830 | } | 1837 | } |
@@ -1835,6 +1842,97 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, | |||
1835 | return active_engine_id; | 1842 | return active_engine_id; |
1836 | } | 1843 | } |
1837 | 1844 | ||
1845 | static bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch, | ||
1846 | bool *verbose, u32 *ms) | ||
1847 | { | ||
1848 | bool recover = false; | ||
1849 | bool progress = false; | ||
1850 | |||
1851 | if (gk20a_channel_get(ch)) { | ||
1852 | recover = gk20a_channel_update_and_check_timeout(ch, | ||
1853 | GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000, | ||
1854 | &progress); | ||
1855 | *verbose = ch->timeout_debug_dump; | ||
1856 | *ms = ch->timeout_accumulated_ms; | ||
1857 | if (recover) | ||
1858 | gk20a_set_error_notifier(ch, | ||
1859 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | ||
1860 | |||
1861 | gk20a_channel_put(ch); | ||
1862 | } | ||
1863 | return recover; | ||
1864 | } | ||
1865 | |||
1866 | static bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg, | ||
1867 | bool *verbose, u32 *ms) | ||
1868 | { | ||
1869 | struct channel_gk20a *ch; | ||
1870 | bool recover = false; | ||
1871 | bool progress = false; | ||
1872 | |||
1873 | *verbose = false; | ||
1874 | *ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000; | ||
1875 | |||
1876 | mutex_lock(&tsg->ch_list_lock); | ||
1877 | |||
1878 | /* check if there was some progress on any of the TSG channels. | ||
1879 | * fifo recovery is needed if at least one channel reached the | ||
1880 | * maximum timeout without progress (update in gpfifo pointers). | ||
1881 | */ | ||
1882 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { | ||
1883 | if (gk20a_channel_get(ch)) { | ||
1884 | recover = gk20a_channel_update_and_check_timeout(ch, | ||
1885 | *ms, &progress); | ||
1886 | if (progress || recover) | ||
1887 | break; | ||
1888 | gk20a_channel_put(ch); | ||
1889 | } | ||
1890 | } | ||
1891 | |||
1892 | /* if at least one channel in the TSG made some progress, reset | ||
1893 | * accumulated timeout for all channels in the TSG. In particular, | ||
1894 | * this resets timeout for channels that already completed their work | ||
1895 | */ | ||
1896 | if (progress) { | ||
1897 | gk20a_dbg_info("progress on tsg=%d ch=%d", | ||
1898 | tsg->tsgid, ch->hw_chid); | ||
1899 | gk20a_channel_put(ch); | ||
1900 | *ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000; | ||
1901 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { | ||
1902 | if (gk20a_channel_get(ch)) { | ||
1903 | ch->timeout_accumulated_ms = *ms; | ||
1904 | gk20a_channel_put(ch); | ||
1905 | } | ||
1906 | } | ||
1907 | } | ||
1908 | |||
1909 | /* if one channel is presumed dead (no progress for too long), then | ||
1910 | * fifo recovery is needed. we can't really figure out which channel | ||
1911 | * caused the problem, so set timeout error notifier for all channels. | ||
1912 | */ | ||
1913 | if (recover) { | ||
1914 | gk20a_dbg_info("timeout on tsg=%d ch=%d", | ||
1915 | tsg->tsgid, ch->hw_chid); | ||
1916 | *ms = ch->timeout_accumulated_ms; | ||
1917 | gk20a_channel_put(ch); | ||
1918 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { | ||
1919 | if (gk20a_channel_get(ch)) { | ||
1920 | gk20a_set_error_notifier(ch, | ||
1921 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | ||
1922 | *verbose |= ch->timeout_debug_dump; | ||
1923 | gk20a_channel_put(ch); | ||
1924 | } | ||
1925 | } | ||
1926 | } | ||
1927 | |||
1928 | /* if we could not detect progress on any of the channel, but none | ||
1929 | * of them has reached the timeout, there is nothing more to do: | ||
1930 | * timeout_accumulated_ms has been updated for all of them. | ||
1931 | */ | ||
1932 | mutex_unlock(&tsg->ch_list_lock); | ||
1933 | return recover; | ||
1934 | } | ||
1935 | |||
1838 | static bool gk20a_fifo_handle_sched_error(struct gk20a *g) | 1936 | static bool gk20a_fifo_handle_sched_error(struct gk20a *g) |
1839 | { | 1937 | { |
1840 | u32 sched_error; | 1938 | u32 sched_error; |
@@ -1859,50 +1957,40 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) | |||
1859 | if (fifo_intr_sched_error_code_f(sched_error) == | 1957 | if (fifo_intr_sched_error_code_f(sched_error) == |
1860 | fifo_intr_sched_error_code_ctxsw_timeout_v()) { | 1958 | fifo_intr_sched_error_code_ctxsw_timeout_v()) { |
1861 | struct fifo_gk20a *f = &g->fifo; | 1959 | struct fifo_gk20a *f = &g->fifo; |
1862 | struct channel_gk20a *ch = &f->channel[id]; | 1960 | u32 ms = 0; |
1961 | bool verbose = false; | ||
1863 | 1962 | ||
1864 | if (is_tsg) { | 1963 | if (is_tsg) { |
1865 | gk20a_channel_timeout_restart_all_channels(g); | 1964 | ret = gk20a_fifo_check_tsg_ctxsw_timeout( |
1866 | gk20a_fifo_recover(g, BIT(engine_id), id, true, | 1965 | &f->tsg[id], &verbose, &ms); |
1867 | true, true); | 1966 | } else { |
1868 | ret = true; | 1967 | ret = gk20a_fifo_check_ch_ctxsw_timeout( |
1869 | goto err; | 1968 | &f->channel[id], &verbose, &ms); |
1870 | } | 1969 | } |
1871 | 1970 | ||
1872 | if (!gk20a_channel_get(ch)) | 1971 | if (ret) { |
1873 | goto err; | ||
1874 | |||
1875 | if (gk20a_channel_update_and_check_timeout(ch, | ||
1876 | GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) { | ||
1877 | gk20a_set_error_notifier(ch, | ||
1878 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | ||
1879 | gk20a_err(dev_from_gk20a(g), | 1972 | gk20a_err(dev_from_gk20a(g), |
1880 | "fifo sched ctxsw timeout error:" | 1973 | "fifo sched ctxsw timeout error: " |
1881 | "engine = %u, ch = %d", engine_id, id); | 1974 | "engine=%u, %s=%d, ms=%u", |
1882 | gk20a_gr_debug_dump(g->dev); | 1975 | engine_id, is_tsg ? "tsg" : "ch", id, ms); |
1883 | /* | 1976 | /* |
1884 | * Cancel all channels' timeout since SCHED error might | 1977 | * Cancel all channels' timeout since SCHED error might |
1885 | * trigger multiple watchdogs at a time | 1978 | * trigger multiple watchdogs at a time |
1886 | */ | 1979 | */ |
1887 | gk20a_channel_timeout_restart_all_channels(g); | 1980 | gk20a_channel_timeout_restart_all_channels(g); |
1888 | gk20a_fifo_recover(g, BIT(engine_id), id, false, | 1981 | gk20a_fifo_recover(g, BIT(engine_id), id, |
1889 | true, ch->timeout_debug_dump); | 1982 | is_tsg, true, verbose); |
1890 | ret = true; | ||
1891 | } else { | 1983 | } else { |
1892 | gk20a_dbg_info( | 1984 | gk20a_dbg_info( |
1893 | "fifo is waiting for ctx switch for %d ms," | 1985 | "fifo is waiting for ctx switch for %d ms, " |
1894 | "ch = %d\n", | 1986 | "%s=%d", ms, is_tsg ? "tsg" : "ch", id); |
1895 | ch->timeout_accumulated_ms, | ||
1896 | id); | ||
1897 | ret = false; | ||
1898 | } | 1987 | } |
1899 | gk20a_channel_put(ch); | 1988 | } else { |
1900 | return ret; | 1989 | gk20a_err(dev_from_gk20a(g), |
1990 | "fifo sched error : 0x%08x, engine=%u, %s=%d", | ||
1991 | sched_error, engine_id, is_tsg ? "tsg" : "ch", id); | ||
1901 | } | 1992 | } |
1902 | 1993 | ||
1903 | gk20a_err(dev_from_gk20a(g), "fifo sched error : 0x%08x, engine=%u, %s=%d", | ||
1904 | sched_error, engine_id, is_tsg ? "tsg" : "ch", id); | ||
1905 | |||
1906 | err: | 1994 | err: |
1907 | return ret; | 1995 | return ret; |
1908 | } | 1996 | } |
@@ -1913,7 +2001,7 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) | |||
1913 | struct device *dev = dev_from_gk20a(g); | 2001 | struct device *dev = dev_from_gk20a(g); |
1914 | u32 handled = 0; | 2002 | u32 handled = 0; |
1915 | 2003 | ||
1916 | gk20a_dbg_fn(""); | 2004 | gk20a_dbg_fn("fifo_intr=0x%08x", fifo_intr); |
1917 | 2005 | ||
1918 | if (fifo_intr & fifo_intr_0_pio_error_pending_f()) { | 2006 | if (fifo_intr & fifo_intr_0_pio_error_pending_f()) { |
1919 | /* pio mode is unused. this shouldn't happen, ever. */ | 2007 | /* pio mode is unused. this shouldn't happen, ever. */ |