diff options
author | Thomas Fleury <tfleury@nvidia.com> | 2016-08-17 20:26:30 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2016-08-29 19:14:29 -0400 |
commit | 5286fd525731d19dfa07d5e6e49e8d0eef233531 (patch) | |
tree | 6c64a7976d8e8795fb6bfda75cdfc7b058a4ecf8 /drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |
parent | 06780e0681b34ec570346fe5d4bdaf7a23f08a36 (diff) |
gpu: nvgpu: fix ctxsw timeout handling for TSGs
While collecting failing engine data, the id type (is_tsg) was not
set for ctxsw and save engine states. This could result in some
ctxsw timeout interrupts being ignored (id reported with the wrong
is_tsg).
For TSGs, check whether we made some progress on any of the channels
before kicking off fifo recovery.
Bug 200228310
Jira EVLR-597
Change-Id: I231549ae68317919532de0f87effb78ee9c119c6
Signed-off-by: Thomas Fleury <tfleury@nvidia.com>
Reviewed-on: http://git-master/r/1204035
(cherry picked from commit 7221d256fd7e9b418f7789b3d81eede8faa16f0b)
Reviewed-on: http://git-master/r/1204037
Reviewed-by: Richard Zhao <rizhao@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/fifo_gk20a.c')
-rw-r--r-- | drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | 156 |
1 files changed, 122 insertions, 34 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c index bd31656f..c18c7c94 100644 --- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c +++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c | |||
@@ -1814,17 +1814,24 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, | |||
1814 | if (ctx_status == | 1814 | if (ctx_status == |
1815 | fifo_engine_status_ctx_status_ctxsw_load_v()) { | 1815 | fifo_engine_status_ctx_status_ctxsw_load_v()) { |
1816 | id = fifo_engine_status_next_id_v(status); | 1816 | id = fifo_engine_status_next_id_v(status); |
1817 | is_tsg = fifo_pbdma_status_id_type_v(status) | 1817 | is_tsg = fifo_engine_status_next_id_type_v(status) != |
1818 | != fifo_pbdma_status_id_type_chid_v(); | 1818 | fifo_engine_status_next_id_type_chid_v(); |
1819 | } else if (ctx_status == | 1819 | } else if (ctx_status == |
1820 | fifo_engine_status_ctx_status_ctxsw_switch_v()) { | 1820 | fifo_engine_status_ctx_status_ctxsw_switch_v()) { |
1821 | mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2)); | 1821 | mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2)); |
1822 | if (mailbox2 & FECS_METHOD_WFI_RESTORE) | 1822 | if (mailbox2 & FECS_METHOD_WFI_RESTORE) { |
1823 | id = fifo_engine_status_next_id_v(status); | 1823 | id = fifo_engine_status_next_id_v(status); |
1824 | else | 1824 | is_tsg = fifo_engine_status_next_id_type_v(status) != |
1825 | fifo_engine_status_next_id_type_chid_v(); | ||
1826 | } else { | ||
1825 | id = fifo_engine_status_id_v(status); | 1827 | id = fifo_engine_status_id_v(status); |
1828 | is_tsg = fifo_engine_status_id_type_v(status) != | ||
1829 | fifo_engine_status_id_type_chid_v(); | ||
1830 | } | ||
1826 | } else { | 1831 | } else { |
1827 | id = fifo_engine_status_id_v(status); | 1832 | id = fifo_engine_status_id_v(status); |
1833 | is_tsg = fifo_engine_status_id_type_v(status) != | ||
1834 | fifo_engine_status_id_type_chid_v(); | ||
1828 | } | 1835 | } |
1829 | break; | 1836 | break; |
1830 | } | 1837 | } |
@@ -1835,6 +1842,97 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g, | |||
1835 | return active_engine_id; | 1842 | return active_engine_id; |
1836 | } | 1843 | } |
1837 | 1844 | ||
1845 | static bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch, | ||
1846 | bool *verbose, u32 *ms) | ||
1847 | { | ||
1848 | bool recover = false; | ||
1849 | bool progress = false; | ||
1850 | |||
1851 | if (gk20a_channel_get(ch)) { | ||
1852 | recover = gk20a_channel_update_and_check_timeout(ch, | ||
1853 | GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000, | ||
1854 | &progress); | ||
1855 | *verbose = ch->timeout_debug_dump; | ||
1856 | *ms = ch->timeout_accumulated_ms; | ||
1857 | if (recover) | ||
1858 | gk20a_set_error_notifier(ch, | ||
1859 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | ||
1860 | |||
1861 | gk20a_channel_put(ch); | ||
1862 | } | ||
1863 | return recover; | ||
1864 | } | ||
1865 | |||
1866 | static bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg, | ||
1867 | bool *verbose, u32 *ms) | ||
1868 | { | ||
1869 | struct channel_gk20a *ch; | ||
1870 | bool recover = false; | ||
1871 | bool progress = false; | ||
1872 | |||
1873 | *verbose = false; | ||
1874 | *ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000; | ||
1875 | |||
1876 | mutex_lock(&tsg->ch_list_lock); | ||
1877 | |||
1878 | /* check if there was some progress on any of the TSG channels. | ||
1879 | * fifo recovery is needed if at least one channel reached the | ||
1880 | * maximum timeout without progress (update in gpfifo pointers). | ||
1881 | */ | ||
1882 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { | ||
1883 | if (gk20a_channel_get(ch)) { | ||
1884 | recover = gk20a_channel_update_and_check_timeout(ch, | ||
1885 | *ms, &progress); | ||
1886 | if (progress || recover) | ||
1887 | break; | ||
1888 | gk20a_channel_put(ch); | ||
1889 | } | ||
1890 | } | ||
1891 | |||
1892 | /* if at least one channel in the TSG made some progress, reset | ||
1893 | * accumulated timeout for all channels in the TSG. In particular, | ||
1894 | * this resets timeout for channels that already completed their work | ||
1895 | */ | ||
1896 | if (progress) { | ||
1897 | gk20a_dbg_info("progress on tsg=%d ch=%d", | ||
1898 | tsg->tsgid, ch->hw_chid); | ||
1899 | gk20a_channel_put(ch); | ||
1900 | *ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000; | ||
1901 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { | ||
1902 | if (gk20a_channel_get(ch)) { | ||
1903 | ch->timeout_accumulated_ms = *ms; | ||
1904 | gk20a_channel_put(ch); | ||
1905 | } | ||
1906 | } | ||
1907 | } | ||
1908 | |||
1909 | /* if one channel is presumed dead (no progress for too long), then | ||
1910 | * fifo recovery is needed. we can't really figure out which channel | ||
1911 | * caused the problem, so set timeout error notifier for all channels. | ||
1912 | */ | ||
1913 | if (recover) { | ||
1914 | gk20a_dbg_info("timeout on tsg=%d ch=%d", | ||
1915 | tsg->tsgid, ch->hw_chid); | ||
1916 | *ms = ch->timeout_accumulated_ms; | ||
1917 | gk20a_channel_put(ch); | ||
1918 | list_for_each_entry(ch, &tsg->ch_list, ch_entry) { | ||
1919 | if (gk20a_channel_get(ch)) { | ||
1920 | gk20a_set_error_notifier(ch, | ||
1921 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | ||
1922 | *verbose |= ch->timeout_debug_dump; | ||
1923 | gk20a_channel_put(ch); | ||
1924 | } | ||
1925 | } | ||
1926 | } | ||
1927 | |||
1928 | /* if we could not detect progress on any of the channel, but none | ||
1929 | * of them has reached the timeout, there is nothing more to do: | ||
1930 | * timeout_accumulated_ms has been updated for all of them. | ||
1931 | */ | ||
1932 | mutex_unlock(&tsg->ch_list_lock); | ||
1933 | return recover; | ||
1934 | } | ||
1935 | |||
1838 | static bool gk20a_fifo_handle_sched_error(struct gk20a *g) | 1936 | static bool gk20a_fifo_handle_sched_error(struct gk20a *g) |
1839 | { | 1937 | { |
1840 | u32 sched_error; | 1938 | u32 sched_error; |
@@ -1859,50 +1957,40 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g) | |||
1859 | if (fifo_intr_sched_error_code_f(sched_error) == | 1957 | if (fifo_intr_sched_error_code_f(sched_error) == |
1860 | fifo_intr_sched_error_code_ctxsw_timeout_v()) { | 1958 | fifo_intr_sched_error_code_ctxsw_timeout_v()) { |
1861 | struct fifo_gk20a *f = &g->fifo; | 1959 | struct fifo_gk20a *f = &g->fifo; |
1862 | struct channel_gk20a *ch = &f->channel[id]; | 1960 | u32 ms = 0; |
1961 | bool verbose = false; | ||
1863 | 1962 | ||
1864 | if (is_tsg) { | 1963 | if (is_tsg) { |
1865 | gk20a_channel_timeout_restart_all_channels(g); | 1964 | ret = gk20a_fifo_check_tsg_ctxsw_timeout( |
1866 | gk20a_fifo_recover(g, BIT(engine_id), id, true, | 1965 | &f->tsg[id], &verbose, &ms); |
1867 | true, true); | 1966 | } else { |
1868 | ret = true; | 1967 | ret = gk20a_fifo_check_ch_ctxsw_timeout( |
1869 | goto err; | 1968 | &f->channel[id], &verbose, &ms); |
1870 | } | 1969 | } |
1871 | 1970 | ||
1872 | if (!gk20a_channel_get(ch)) | 1971 | if (ret) { |
1873 | goto err; | ||
1874 | |||
1875 | if (gk20a_channel_update_and_check_timeout(ch, | ||
1876 | GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) { | ||
1877 | gk20a_set_error_notifier(ch, | ||
1878 | NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT); | ||
1879 | gk20a_err(dev_from_gk20a(g), | 1972 | gk20a_err(dev_from_gk20a(g), |
1880 | "fifo sched ctxsw timeout error:" | 1973 | "fifo sched ctxsw timeout error: " |
1881 | "engine = %u, ch = %d", engine_id, id); | 1974 | "engine=%u, %s=%d, ms=%u", |
1882 | gk20a_gr_debug_dump(g->dev); | 1975 | engine_id, is_tsg ? "tsg" : "ch", id, ms); |
1883 | /* | 1976 | /* |
1884 | * Cancel all channels' timeout since SCHED error might | 1977 | * Cancel all channels' timeout since SCHED error might |
1885 | * trigger multiple watchdogs at a time | 1978 | * trigger multiple watchdogs at a time |
1886 | */ | 1979 | */ |
1887 | gk20a_channel_timeout_restart_all_channels(g); | 1980 | gk20a_channel_timeout_restart_all_channels(g); |
1888 | gk20a_fifo_recover(g, BIT(engine_id), id, false, | 1981 | gk20a_fifo_recover(g, BIT(engine_id), id, |
1889 | true, ch->timeout_debug_dump); | 1982 | is_tsg, true, verbose); |
1890 | ret = true; | ||
1891 | } else { | 1983 | } else { |
1892 | gk20a_dbg_info( | 1984 | gk20a_dbg_info( |
1893 | "fifo is waiting for ctx switch for %d ms," | 1985 | "fifo is waiting for ctx switch for %d ms, " |
1894 | "ch = %d\n", | 1986 | "%s=%d", ms, is_tsg ? "tsg" : "ch", id); |
1895 | ch->timeout_accumulated_ms, | ||
1896 | id); | ||
1897 | ret = false; | ||
1898 | } | 1987 | } |
1899 | gk20a_channel_put(ch); | 1988 | } else { |
1900 | return ret; | 1989 | gk20a_err(dev_from_gk20a(g), |
1990 | "fifo sched error : 0x%08x, engine=%u, %s=%d", | ||
1991 | sched_error, engine_id, is_tsg ? "tsg" : "ch", id); | ||
1901 | } | 1992 | } |
1902 | 1993 | ||
1903 | gk20a_err(dev_from_gk20a(g), "fifo sched error : 0x%08x, engine=%u, %s=%d", | ||
1904 | sched_error, engine_id, is_tsg ? "tsg" : "ch", id); | ||
1905 | |||
1906 | err: | 1994 | err: |
1907 | return ret; | 1995 | return ret; |
1908 | } | 1996 | } |
@@ -1913,7 +2001,7 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr) | |||
1913 | struct device *dev = dev_from_gk20a(g); | 2001 | struct device *dev = dev_from_gk20a(g); |
1914 | u32 handled = 0; | 2002 | u32 handled = 0; |
1915 | 2003 | ||
1916 | gk20a_dbg_fn(""); | 2004 | gk20a_dbg_fn("fifo_intr=0x%08x", fifo_intr); |
1917 | 2005 | ||
1918 | if (fifo_intr & fifo_intr_0_pio_error_pending_f()) { | 2006 | if (fifo_intr & fifo_intr_0_pio_error_pending_f()) { |
1919 | /* pio mode is unused. this shouldn't happen, ever. */ | 2007 | /* pio mode is unused. this shouldn't happen, ever. */ |