summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
diff options
context:
space:
mode:
authorThomas Fleury <tfleury@nvidia.com>2016-08-17 20:26:30 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2016-08-29 19:14:29 -0400
commit5286fd525731d19dfa07d5e6e49e8d0eef233531 (patch)
tree6c64a7976d8e8795fb6bfda75cdfc7b058a4ecf8 /drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
parent06780e0681b34ec570346fe5d4bdaf7a23f08a36 (diff)
gpu: nvgpu: fix ctxsw timeout handling for TSGs
While collecting failing engine data, id type (is_tsg) was not set for ctxsw and save engine states. This could result in some ctxsw timeout interrupts being ignored (id reported with wrong is_tsg). For TSGs, check if we made some progress on any of the channels before kicking fifo recovery. Bug 200228310 Jira EVLR-597 Change-Id: I231549ae68317919532de0f87effb78ee9c119c6 Signed-off-by: Thomas Fleury <tfleury@nvidia.com> Reviewed-on: http://git-master/r/1204035 (cherry picked from commit 7221d256fd7e9b418f7789b3d81eede8faa16f0b) Reviewed-on: http://git-master/r/1204037 Reviewed-by: Richard Zhao <rizhao@nvidia.com> GVS: Gerrit_Virtual_Submit Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu/gk20a/fifo_gk20a.c')
-rw-r--r--drivers/gpu/nvgpu/gk20a/fifo_gk20a.c156
1 files changed, 122 insertions, 34 deletions
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
index bd31656f..c18c7c94 100644
--- a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -1814,17 +1814,24 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
1814 if (ctx_status == 1814 if (ctx_status ==
1815 fifo_engine_status_ctx_status_ctxsw_load_v()) { 1815 fifo_engine_status_ctx_status_ctxsw_load_v()) {
1816 id = fifo_engine_status_next_id_v(status); 1816 id = fifo_engine_status_next_id_v(status);
1817 is_tsg = fifo_pbdma_status_id_type_v(status) 1817 is_tsg = fifo_engine_status_next_id_type_v(status) !=
1818 != fifo_pbdma_status_id_type_chid_v(); 1818 fifo_engine_status_next_id_type_chid_v();
1819 } else if (ctx_status == 1819 } else if (ctx_status ==
1820 fifo_engine_status_ctx_status_ctxsw_switch_v()) { 1820 fifo_engine_status_ctx_status_ctxsw_switch_v()) {
1821 mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2)); 1821 mailbox2 = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(2));
1822 if (mailbox2 & FECS_METHOD_WFI_RESTORE) 1822 if (mailbox2 & FECS_METHOD_WFI_RESTORE) {
1823 id = fifo_engine_status_next_id_v(status); 1823 id = fifo_engine_status_next_id_v(status);
1824 else 1824 is_tsg = fifo_engine_status_next_id_type_v(status) !=
1825 fifo_engine_status_next_id_type_chid_v();
1826 } else {
1825 id = fifo_engine_status_id_v(status); 1827 id = fifo_engine_status_id_v(status);
1828 is_tsg = fifo_engine_status_id_type_v(status) !=
1829 fifo_engine_status_id_type_chid_v();
1830 }
1826 } else { 1831 } else {
1827 id = fifo_engine_status_id_v(status); 1832 id = fifo_engine_status_id_v(status);
1833 is_tsg = fifo_engine_status_id_type_v(status) !=
1834 fifo_engine_status_id_type_chid_v();
1828 } 1835 }
1829 break; 1836 break;
1830 } 1837 }
@@ -1835,6 +1842,97 @@ u32 gk20a_fifo_get_failing_engine_data(struct gk20a *g,
1835 return active_engine_id; 1842 return active_engine_id;
1836} 1843}
1837 1844
/*
 * Check one channel for a ctxsw timeout (used for the non-TSG case).
 *
 * @ch:      channel to check; a reference is taken with gk20a_channel_get()
 *           for the duration of the check and dropped before returning.
 * @verbose: out, set to ch->timeout_debug_dump.
 * @ms:      out, set to ch->timeout_accumulated_ms after the update.
 *
 * Returns the result of gk20a_channel_update_and_check_timeout(), i.e. true
 * when fifo recovery is needed. In that case the idle-timeout error notifier
 * is set on the channel before returning.
 *
 * If the channel reference cannot be taken, returns false and leaves
 * *verbose and *ms untouched, so callers should initialize them.
 */
1845static bool gk20a_fifo_check_ch_ctxsw_timeout(struct channel_gk20a *ch,
1846 bool *verbose, u32 *ms)
1847{
1848 bool recover = false;
/* progress is filled in by the helper below but not consulted here */
1849 bool progress = false;
1850
1851 if (gk20a_channel_get(ch)) {
/* presumably converts the check period from us to ms -- confirm */
1852 recover = gk20a_channel_update_and_check_timeout(ch,
1853 GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000,
1854 &progress);
1855 *verbose = ch->timeout_debug_dump;
1856 *ms = ch->timeout_accumulated_ms;
1857 if (recover)
1858 gk20a_set_error_notifier(ch,
1859 NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
1860
1861 gk20a_channel_put(ch);
1862 }
1863 return recover;
1864}
1865
/*
 * Check all channels of a TSG for a ctxsw timeout.
 *
 * @tsg:     TSG whose ch_list is walked; tsg->ch_list_lock is held for the
 *           whole function body.
 * @verbose: out, starts as false; on recovery it is the OR of
 *           timeout_debug_dump over every channel that could be referenced.
 * @ms:      out, starts as GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000; on recovery
 *           it is the timeout_accumulated_ms of the channel that triggered it.
 *
 * Returns true when fifo recovery is needed, false otherwise.
 *
 * The first loop stops at the first channel for which the timeout helper
 * reports either progress or recover. On that early exit the channel
 * reference is intentionally still held; it is released at the top of the
 * matching "progress" or "recover" branch below, before ch is reused as the
 * iterator of the second loop.
 *
 * NOTE(review): if the helper could ever report progress and recover at the
 * same time, both branches would run and put the same channel twice --
 * presumably the helper never does that; confirm against its definition.
 */
1866static bool gk20a_fifo_check_tsg_ctxsw_timeout(struct tsg_gk20a *tsg,
1867 bool *verbose, u32 *ms)
1868{
1869 struct channel_gk20a *ch;
1870 bool recover = false;
1871 bool progress = false;
1872
1873 *verbose = false;
1874 *ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000;
1875
1876 mutex_lock(&tsg->ch_list_lock);
1877
1878 /* check if there was some progress on any of the TSG channels.
1879 * fifo recovery is needed if at least one channel reached the
1880 * maximum timeout without progress (update in gpfifo pointers).
1881 */
1882 list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
1883 if (gk20a_channel_get(ch)) {
1884 recover = gk20a_channel_update_and_check_timeout(ch,
1885 *ms, &progress);
/* break with the reference still held; it is put further down */
1886 if (progress || recover)
1887 break;
1888 gk20a_channel_put(ch);
1889 }
1890 }
1891
1892 /* if at least one channel in the TSG made some progress, reset
1893 * accumulated timeout for all channels in the TSG. In particular,
1894 * this resets timeout for channels that already completed their work
1895 */
1896 if (progress) {
1897 gk20a_dbg_info("progress on tsg=%d ch=%d",
1898 tsg->tsgid, ch->hw_chid);
1899 gk20a_channel_put(ch);
1900 *ms = GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000;
1901 list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
1902 if (gk20a_channel_get(ch)) {
1903 ch->timeout_accumulated_ms = *ms;
1904 gk20a_channel_put(ch);
1905 }
1906 }
1907 }
1908
1909 /* if one channel is presumed dead (no progress for too long), then
1910 * fifo recovery is needed. we can't really figure out which channel
1911 * caused the problem, so set timeout error notifier for all channels.
1912 */
1913 if (recover) {
1914 gk20a_dbg_info("timeout on tsg=%d ch=%d",
1915 tsg->tsgid, ch->hw_chid);
/* read the accumulated time before dropping the reference */
1916 *ms = ch->timeout_accumulated_ms;
1917 gk20a_channel_put(ch);
1918 list_for_each_entry(ch, &tsg->ch_list, ch_entry) {
1919 if (gk20a_channel_get(ch)) {
1920 gk20a_set_error_notifier(ch,
1921 NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
1922 *verbose |= ch->timeout_debug_dump;
1923 gk20a_channel_put(ch);
1924 }
1925 }
1926 }
1927
1928 /* if we could not detect progress on any of the channel, but none
1929 * of them has reached the timeout, there is nothing more to do:
1930 * timeout_accumulated_ms has been updated for all of them.
1931 */
1932 mutex_unlock(&tsg->ch_list_lock);
1933 return recover;
1934}
1935
1838static bool gk20a_fifo_handle_sched_error(struct gk20a *g) 1936static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
1839{ 1937{
1840 u32 sched_error; 1938 u32 sched_error;
@@ -1859,50 +1957,40 @@ static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
1859 if (fifo_intr_sched_error_code_f(sched_error) == 1957 if (fifo_intr_sched_error_code_f(sched_error) ==
1860 fifo_intr_sched_error_code_ctxsw_timeout_v()) { 1958 fifo_intr_sched_error_code_ctxsw_timeout_v()) {
1861 struct fifo_gk20a *f = &g->fifo; 1959 struct fifo_gk20a *f = &g->fifo;
1862 struct channel_gk20a *ch = &f->channel[id]; 1960 u32 ms = 0;
1961 bool verbose = false;
1863 1962
1864 if (is_tsg) { 1963 if (is_tsg) {
1865 gk20a_channel_timeout_restart_all_channels(g); 1964 ret = gk20a_fifo_check_tsg_ctxsw_timeout(
1866 gk20a_fifo_recover(g, BIT(engine_id), id, true, 1965 &f->tsg[id], &verbose, &ms);
1867 true, true); 1966 } else {
1868 ret = true; 1967 ret = gk20a_fifo_check_ch_ctxsw_timeout(
1869 goto err; 1968 &f->channel[id], &verbose, &ms);
1870 } 1969 }
1871 1970
1872 if (!gk20a_channel_get(ch)) 1971 if (ret) {
1873 goto err;
1874
1875 if (gk20a_channel_update_and_check_timeout(ch,
1876 GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) {
1877 gk20a_set_error_notifier(ch,
1878 NVGPU_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
1879 gk20a_err(dev_from_gk20a(g), 1972 gk20a_err(dev_from_gk20a(g),
1880 "fifo sched ctxsw timeout error:" 1973 "fifo sched ctxsw timeout error: "
1881 "engine = %u, ch = %d", engine_id, id); 1974 "engine=%u, %s=%d, ms=%u",
1882 gk20a_gr_debug_dump(g->dev); 1975 engine_id, is_tsg ? "tsg" : "ch", id, ms);
1883 /* 1976 /*
1884 * Cancel all channels' timeout since SCHED error might 1977 * Cancel all channels' timeout since SCHED error might
1885 * trigger multiple watchdogs at a time 1978 * trigger multiple watchdogs at a time
1886 */ 1979 */
1887 gk20a_channel_timeout_restart_all_channels(g); 1980 gk20a_channel_timeout_restart_all_channels(g);
1888 gk20a_fifo_recover(g, BIT(engine_id), id, false, 1981 gk20a_fifo_recover(g, BIT(engine_id), id,
1889 true, ch->timeout_debug_dump); 1982 is_tsg, true, verbose);
1890 ret = true;
1891 } else { 1983 } else {
1892 gk20a_dbg_info( 1984 gk20a_dbg_info(
1893 "fifo is waiting for ctx switch for %d ms," 1985 "fifo is waiting for ctx switch for %d ms, "
1894 "ch = %d\n", 1986 "%s=%d", ms, is_tsg ? "tsg" : "ch", id);
1895 ch->timeout_accumulated_ms,
1896 id);
1897 ret = false;
1898 } 1987 }
1899 gk20a_channel_put(ch); 1988 } else {
1900 return ret; 1989 gk20a_err(dev_from_gk20a(g),
1990 "fifo sched error : 0x%08x, engine=%u, %s=%d",
1991 sched_error, engine_id, is_tsg ? "tsg" : "ch", id);
1901 } 1992 }
1902 1993
1903 gk20a_err(dev_from_gk20a(g), "fifo sched error : 0x%08x, engine=%u, %s=%d",
1904 sched_error, engine_id, is_tsg ? "tsg" : "ch", id);
1905
1906err: 1994err:
1907 return ret; 1995 return ret;
1908} 1996}
@@ -1913,7 +2001,7 @@ static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr)
1913 struct device *dev = dev_from_gk20a(g); 2001 struct device *dev = dev_from_gk20a(g);
1914 u32 handled = 0; 2002 u32 handled = 0;
1915 2003
1916 gk20a_dbg_fn(""); 2004 gk20a_dbg_fn("fifo_intr=0x%08x", fifo_intr);
1917 2005
1918 if (fifo_intr & fifo_intr_0_pio_error_pending_f()) { 2006 if (fifo_intr & fifo_intr_0_pio_error_pending_f()) {
1919 /* pio mode is unused. this shouldn't happen, ever. */ 2007 /* pio mode is unused. this shouldn't happen, ever. */