console UTF-8 fixes

The UTF-8 part of the vt driver suffers from the following issues, which are addressed in my patch:

1) If there's no glyph found for a particular valid UTF-8 character, we try to display U+FFFD. However, if this one is not found either, here's what the current kernel does:

- First, if the Unicode value is less than the number of glyphs, use the glyph directly from that position of the glyph table. While it may be a good idea in the 8-bit world, it makes absolutely no sense with Unicode in mind. For example, if a Latin-2 font is loaded and an application prints U+00FB ("u with circumflex", not present in Latin-2) then as a fallback solution the glyph from the 0xFB position of the Latin-2 fontset (which is a "u with double accent" - a different character) is displayed.

- Second, if this fallback fails too, a simple ASCII question mark is printed, which is visually indistinguishable from a real question mark.

I changed the code to skip the first step (except if in non-UTF-8 mode), and changed the second step to print the question mark with inverse color attributes, so it is visually clear that it's not a real question mark, and it more closely resembles the common glyph of U+FFFD.

2) The UTF-8 decoder is buggy in many ways:

- Lone continuation bytes (section 3.1 of Markus Kuhn's UTF-8 stress test) are not caught; they are displayed as some "random" (taken directly from the font table, see above) glyphs instead of the replacement character.

- Incomplete sequences (sections 3.2 and 3.3 of the stress test) emit no replacement character, but rather cause the subsequent valid character to be displayed multiple times(!).

- The decoder is not safe: overlong sequences are not caught currently; they are displayed as if these were valid representations. This may even have security impacts.

- The decoder does not handle D800..DFFF and FFFE..FFFF specially; it just emits these code points and lets them be looked up in the glyph table.
Since these are invalid code points, I replace them by U+FFFD and hence give no chance for them to be looked up in the glyph table. (Assuming no font ships glyphs for these code points, this change is not visible to the users since the glyph shown will be the same.)

With my fixes to the decoder it now behaves exactly as Markus Kuhn's stress test recommends.

3) It has no concept of double-width (CJK) characters. It's way beyond the scope of my patch to try to display them, but at least I think it's important for the cursor to jump two positions when printing such characters, since this is what applications (such as text editors) expect. Currently the cursor only jumps one position, and hence applications suffer from displaying and refreshing problems, and editing some English letters that are preceded by some CJK characters in the same line is a nightmare. With my patch an additional space is inserted after the CJK character has been printed (which usually means a replacement symbol of course). (If U+FFFD isn't available and hence an inverse question mark is displayed in the first cell, I keep the inverted state for the space in the 2nd column so it's quite easy to see that they are tied together.)

4) There is a small built-in table of zero-width spaces that are not to be printed but silently skipped. U+200A is included there, but it's not a zero-width character, so I remove it from there.

Signed-off-by: Egmont Koblinger <egmont@uhulinux.hu>
Cc: Jan Engelhardt <jengelh@linux01.gwdg.de>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Egmont Koblinger <egmont@uhulinux.hu> 2007-05-08 03:30:37 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-05-08 14:15:12 -0400
commit: 2f1a2ccb9c0de632ab07193becf5f7121794f6ae (patch)
tree: 8fffd5aa34634ad6809e05e0dcbeebec5d039d3f /drivers/char/vt.c
parent: e659ba4a0d2d471c0d73590f78e1a1b5a1eede48 (diff)
1 files changed, 179 insertions, 78 deletions
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 1bbb45b937fd..afd00464184e 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -1932,6 +1932,46 @@ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c)
 char con_buf[CON_BUF_SIZE];
 DECLARE_MUTEX(con_buf_sem);
+/* is_double_width() is based on the wcwidth() implementation by
+ * Markus Kuhn -- 2003-05-20 (Unicode 4.0)
+ * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
+ */
+struct interval {
+        uint32_t first;
+        uint32_t last;
+};
+static int bisearch(uint32_t ucs, const struct interval *table, int max)
+{
+        int min = 0;
+        int mid;
+        if (ucs < table[0].first || ucs > table[max].last)
+                return 0;
+        while (max >= min) {
+                mid = (min + max) / 2;
+                if (ucs > table[mid].last)
+                        min = mid + 1;
+                else if (ucs < table[mid].first)
+                        max = mid - 1;
+                else
+                        return 1;
+        }
+        return 0;
+}
+static int is_double_width(uint32_t ucs)
+{
+        static const struct interval double_width[] = {
+                { 0x1100, 0x115F }, { 0x2329, 0x232A }, { 0x2E80, 0x303E },
+                { 0x3040, 0xA4CF }, { 0xAC00, 0xD7A3 }, { 0xF900, 0xFAFF },
+                { 0xFE30, 0xFE6F }, { 0xFF00, 0xFF60 }, { 0xFFE0, 0xFFE6 },
+                { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD }
+        };
+        return bisearch(ucs, double_width,
+                sizeof(double_width) / sizeof(*double_width) - 1);
+}
 /* acquires console_sem */
 static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int count)
 {
@@ -1948,6 +1988,10 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
        unsigned int currcons;
        unsigned long draw_from = 0, draw_to = 0;
        struct vc_data *vc;
+        unsigned char vc_attr;
+        uint8_t rescan;
+        uint8_t inverse;
+        uint8_t width;
        u16 himask, charmask;
        const unsigned char *orig_buf = NULL;
        int orig_count;
@@ -2010,53 +2054,86 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
                buf++;
                n++;
                count--;
+                rescan = 0;
+                inverse = 0;
+                width = 1;
                /* Do no translation at all in control states */
                if (vc->vc_state != ESnormal) {
                        tc = c;
                } else if (vc->vc_utf && !vc->vc_disp_ctrl) {
-                    /* Combine UTF-8 into Unicode */
+                    /* Combine UTF-8 into Unicode in vc_utf_char.
-                    /* Malformed sequences as sequences of replacement glyphs */
+                     * vc_utf_count is the number of continuation bytes still
+                     * expected to arrive.
+                     * vc_npar is the number of continuation bytes arrived so
+                     * far
+                     */
 rescan_last_byte:
-                    if(c > 0x7f) {
+                    if ((c & 0xc0) == 0x80) {
+                        /* Continuation byte received */
+                        static const uint32_t utf8_length_changes[] = { 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff };
                        if (vc->vc_utf_count) {
-                               if ((c & 0xc0) == 0x80) {
+                            vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
-                                       vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
+                            vc->vc_npar++;
-                                       if (--vc->vc_utf_count) {
+                            if (--vc->vc_utf_count) {
-                                               vc->vc_npar++;
+                                /* Still need some bytes */
-                                               continue;
-                                       }
-                                       tc = c = vc->vc_utf_char;
-                               } else
-                                       goto replacement_glyph;
-                        } else {
-                                vc->vc_npar = 0;
-                                if ((c & 0xe0) == 0xc0) {
-                                    vc->vc_utf_count = 1;
-                                    vc->vc_utf_char = (c & 0x1f);
-                                } else if ((c & 0xf0) == 0xe0) {
-                                    vc->vc_utf_count = 2;
-                                    vc->vc_utf_char = (c & 0x0f);
-                                } else if ((c & 0xf8) == 0xf0) {
-                                    vc->vc_utf_count = 3;
-                                    vc->vc_utf_char = (c & 0x07);
-                                } else if ((c & 0xfc) == 0xf8) {
-                                    vc->vc_utf_count = 4;
-                                    vc->vc_utf_char = (c & 0x03);
-                                } else if ((c & 0xfe) == 0xfc) {
-                                    vc->vc_utf_count = 5;
-                                    vc->vc_utf_char = (c & 0x01);
-                                } else
-                                    goto replacement_glyph;
                                continue;
-                              }
+                            }
+                            /* Got a whole character */
+                            c = vc->vc_utf_char;
+                            /* Reject overlong sequences */
+                            if (c <= utf8_length_changes[vc->vc_npar - 1] ||
+                                        c > utf8_length_changes[vc->vc_npar])
+                                c = 0xfffd;
+                        } else {
+                            /* Unexpected continuation byte */
+                            vc->vc_utf_count = 0;
+                            c = 0xfffd;
+                        }
                    } else {
-                      if (vc->vc_utf_count)
+                        /* Single ASCII byte or first byte of a sequence received */
-                              goto replacement_glyph;
+                        if (vc->vc_utf_count) {
-                      tc = c;
+                            /* Continuation byte expected */
+                            rescan = 1;
+                            vc->vc_utf_count = 0;
+                            c = 0xfffd;
+                        } else if (c > 0x7f) {
+                            /* First byte of a multibyte sequence received */
+                            vc->vc_npar = 0;
+                            if ((c & 0xe0) == 0xc0) {
+                                vc->vc_utf_count = 1;
+                                vc->vc_utf_char = (c & 0x1f);
+                            } else if ((c & 0xf0) == 0xe0) {
+                                vc->vc_utf_count = 2;
+                                vc->vc_utf_char = (c & 0x0f);
+                            } else if ((c & 0xf8) == 0xf0) {
+                                vc->vc_utf_count = 3;
+                                vc->vc_utf_char = (c & 0x07);
+                            } else if ((c & 0xfc) == 0xf8) {
+                                vc->vc_utf_count = 4;
+                                vc->vc_utf_char = (c & 0x03);
+                            } else if ((c & 0xfe) == 0xfc) {
+                                vc->vc_utf_count = 5;
+                                vc->vc_utf_char = (c & 0x01);
+                            } else {
+                                /* 254 and 255 are invalid */
+                                c = 0xfffd;
+                            }
+                            if (vc->vc_utf_count) {
+                                /* Still need some bytes */
+                                continue;
+                            }
+                        }
+                        /* Nothing to do if an ASCII byte was received */
                    }
+                    /* End of UTF-8 decoding. */
+                    /* c is the received character, or U+FFFD for invalid sequences. */
+                    /* Replace invalid Unicode code points with U+FFFD too */
+                    if ((c >= 0xd800 && c <= 0xdfff) || c == 0xfffe || c == 0xffff)
+                        c = 0xfffd;
+                    tc = c;
                } else {        /* no utf or alternate charset mode */
-                  tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
+                    tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
                }
                /* If the original code was a control character we
@@ -2076,56 +2153,80 @@ rescan_last_byte:
                        && (c != 128+27);
                if (vc->vc_state == ESnormal && ok) {
+                        if (vc->vc_utf && !vc->vc_disp_ctrl) {
+                                if (is_double_width(c))
+                                        width = 2;
+                        }
                        /* Now try to find out how to display it */
                        tc = conv_uni_to_pc(vc, tc);
                        if (tc & ~charmask) {
-                                if ( tc == -4 ) {
+                                if (tc == -1 || tc == -2) {
-                                /* If we got -4 (not found) then see if we have
+                                    continue; /* nothing to display */
-                                   defined a replacement character (U+FFFD) */
+                                }
-replacement_glyph:
+                                /* Glyph not found */
-                                        tc = conv_uni_to_pc(vc, 0xfffd);
+                                if (!(vc->vc_utf && !vc->vc_disp_ctrl) && !(c & ~charmask)) {
-                                        if (!(tc & ~charmask))
+                                    /* In legacy mode use the glyph we get by a 1:1 mapping.
-                                                goto display_glyph;
+                                       This would make absolutely no sense with Unicode in mind. */
-                                } else if ( tc != -3 )
+                                    tc = c;
-                                        continue; /* nothing to display */
+                                } else {
-                                /* no hash table or no replacement --
+                                    /* Display U+FFFD. If it's not found, display an inverse question mark. */
-                                 * hope for the best */
+                                    tc = conv_uni_to_pc(vc, 0xfffd);
-                                if ( c & ~charmask )
+                                    if (tc < 0) {
-                                        tc = '?';
+                                        inverse = 1;
-                                else
+                                        tc = conv_uni_to_pc(vc, '?');
-                                        tc = c;
+                                        if (tc < 0) tc = '?';
+                                    }
+                                }
                        }
-display_glyph:
+                        if (!inverse) {
-                        if (vc->vc_need_wrap || vc->vc_decim)
+                                vc_attr = vc->vc_attr;
-                                FLUSH
-                        if (vc->vc_need_wrap) {
-                                cr(vc);
-                                lf(vc);
-                        }
-                        if (vc->vc_decim)
-                                insert_char(vc, 1);
-                        scr_writew(himask ?
-                                     ((vc->vc_attr << 8) & ~himask) + ((tc & 0x100) ? himask : 0) + (tc & 0xff) :
-                                     (vc->vc_attr << 8) + tc,
-                                   (u16 *) vc->vc_pos);
-                        if (DO_UPDATE(vc) && draw_x < 0) {
-                                draw_x = vc->vc_x;
-                                draw_from = vc->vc_pos;
-                        }
-                        if (vc->vc_x == vc->vc_cols - 1) {
-                                vc->vc_need_wrap = vc->vc_decawm;
-                                draw_to = vc->vc_pos + 2;
                        } else {
-                                vc->vc_x++;
+                                /* invert vc_attr */
-                                draw_to = (vc->vc_pos += 2);
+                                if (!vc->vc_can_do_color) {
+                                        vc_attr = (vc->vc_attr) ^ 0x08;
+                                } else if (vc->vc_hi_font_mask == 0x100) {
+                                        vc_attr = ((vc->vc_attr) & 0x11) | (((vc->vc_attr) & 0xe0) >> 4) | (((vc->vc_attr) & 0x0e) << 4);
+                                } else {
+                                        vc_attr = ((vc->vc_attr) & 0x88) | (((vc->vc_attr) & 0x70) >> 4) | (((vc->vc_attr) & 0x07) << 4);
+                                }
                        }
-                        if (vc->vc_utf_count) {
-                                if (vc->vc_npar) {
+                        while (1) {
-                                        vc->vc_npar--;
+                                if (vc->vc_need_wrap || vc->vc_decim)
-                                        goto display_glyph;
+                                        FLUSH
+                                if (vc->vc_need_wrap) {
+                                        cr(vc);
+                                        lf(vc);
+                                }
+                                if (vc->vc_decim)
+                                        insert_char(vc, 1);
+                                scr_writew(himask ?
+                                             ((vc_attr << 8) & ~himask) + ((tc & 0x100) ? himask : 0) + (tc & 0xff) :
+                                             (vc_attr << 8) + tc,
+                                           (u16 *) vc->vc_pos);
+                                if (DO_UPDATE(vc) && draw_x < 0) {
+                                        draw_x = vc->vc_x;
+                                        draw_from = vc->vc_pos;
+                                }
+                                if (vc->vc_x == vc->vc_cols - 1) {
+                                        vc->vc_need_wrap = vc->vc_decawm;
+                                        draw_to = vc->vc_pos + 2;
+                                } else {
+                                        vc->vc_x++;
+                                        draw_to = (vc->vc_pos += 2);
                                }
-                                vc->vc_utf_count = 0;
+                                if (!--width) break;
+                                tc = conv_uni_to_pc(vc, ' '); /* A space is printed in the second column */
+                                if (tc < 0) tc = ' ';
+                        }
+                        if (rescan) {
+                                rescan = 0;
+                                inverse = 0;
+                                width = 1;
                                c = orig;
                                goto rescan_last_byte;
                        }
author	Egmont Koblinger <egmont@uhulinux.hu>	2007-05-08 03:30:37 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-05-08 14:15:12 -0400
commit	2f1a2ccb9c0de632ab07193becf5f7121794f6ae (patch)
tree	8fffd5aa34634ad6809e05e0dcbeebec5d039d3f /drivers/char/vt.c
parent	e659ba4a0d2d471c0d73590f78e1a1b5a1eede48 (diff)

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 1bbb45b937fd..afd00464184e 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -1932,6 +1932,46 @@ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c)
1932	char con_buf[CON_BUF_SIZE];	1932	char con_buf[CON_BUF_SIZE];
1933	DECLARE_MUTEX(con_buf_sem);	1933	DECLARE_MUTEX(con_buf_sem);
1934		1934
		1935	/* is_double_width() is based on the wcwidth() implementation by
		1936	* Markus Kuhn -- 2003-05-20 (Unicode 4.0)
		1937	* Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
		1938	*/
		1939	struct interval {
		1940	uint32_t first;
		1941	uint32_t last;
		1942	};
		1943
		1944	static int bisearch(uint32_t ucs, const struct interval *table, int max)
		1945	{
		1946	int min = 0;
		1947	int mid;
		1948
		1949	if (ucs < table[0].first \|\| ucs > table[max].last)
		1950	return 0;
		1951	while (max >= min) {
		1952	mid = (min + max) / 2;
		1953	if (ucs > table[mid].last)
		1954	min = mid + 1;
		1955	else if (ucs < table[mid].first)
		1956	max = mid - 1;
		1957	else
		1958	return 1;
		1959	}
		1960	return 0;
		1961	}
		1962
		1963	static int is_double_width(uint32_t ucs)
		1964	{
		1965	static const struct interval double_width[] = {
		1966	{ 0x1100, 0x115F }, { 0x2329, 0x232A }, { 0x2E80, 0x303E },
		1967	{ 0x3040, 0xA4CF }, { 0xAC00, 0xD7A3 }, { 0xF900, 0xFAFF },
		1968	{ 0xFE30, 0xFE6F }, { 0xFF00, 0xFF60 }, { 0xFFE0, 0xFFE6 },
		1969	{ 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD }
		1970	};
		1971	return bisearch(ucs, double_width,
		1972	sizeof(double_width) / sizeof(*double_width) - 1);
		1973	}
		1974
1935	/* acquires console_sem */	1975	/* acquires console_sem */
1936	static int do_con_write(struct tty_struct tty, const unsigned char buf, int count)	1976	static int do_con_write(struct tty_struct tty, const unsigned char buf, int count)
1937	{	1977	{
@@ -1948,6 +1988,10 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
1948	unsigned int currcons;	1988	unsigned int currcons;
1949	unsigned long draw_from = 0, draw_to = 0;	1989	unsigned long draw_from = 0, draw_to = 0;
1950	struct vc_data *vc;	1990	struct vc_data *vc;
		1991	unsigned char vc_attr;
		1992	uint8_t rescan;
		1993	uint8_t inverse;
		1994	uint8_t width;
1951	u16 himask, charmask;	1995	u16 himask, charmask;
1952	const unsigned char *orig_buf = NULL;	1996	const unsigned char *orig_buf = NULL;
1953	int orig_count;	1997	int orig_count;
@@ -2010,53 +2054,86 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
2010	buf++;	2054	buf++;
2011	n++;	2055	n++;
2012	count--;	2056	count--;
		2057	rescan = 0;
		2058	inverse = 0;
		2059	width = 1;
2013		2060
2014	/* Do no translation at all in control states */	2061	/* Do no translation at all in control states */
2015	if (vc->vc_state != ESnormal) {	2062	if (vc->vc_state != ESnormal) {
2016	tc = c;	2063	tc = c;
2017	} else if (vc->vc_utf && !vc->vc_disp_ctrl) {	2064	} else if (vc->vc_utf && !vc->vc_disp_ctrl) {
2018	/* Combine UTF-8 into Unicode */	2065	/* Combine UTF-8 into Unicode in vc_utf_char.
2019	/* Malformed sequences as sequences of replacement glyphs */	2066	* vc_utf_count is the number of continuation bytes still
		2067	* expected to arrive.
		2068	* vc_npar is the number of continuation bytes arrived so
		2069	* far
		2070	*/
2020	rescan_last_byte:	2071	rescan_last_byte:
2021	if(c > 0x7f) {	2072	if ((c & 0xc0) == 0x80) {
		2073	/* Continuation byte received */
		2074	static const uint32_t utf8_length_changes[] = { 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff };
2022	if (vc->vc_utf_count) {	2075	if (vc->vc_utf_count) {
2023	if ((c & 0xc0) == 0x80) {	2076	vc->vc_utf_char = (vc->vc_utf_char << 6) \| (c & 0x3f);
2024	vc->vc_utf_char = (vc->vc_utf_char << 6) \| (c & 0x3f);	2077	vc->vc_npar++;
2025	if (--vc->vc_utf_count) {	2078	if (--vc->vc_utf_count) {
2026	vc->vc_npar++;	2079	/* Still need some bytes */
2027	continue;
2028	}
2029	tc = c = vc->vc_utf_char;
2030	} else
2031	goto replacement_glyph;
2032	} else {
2033	vc->vc_npar = 0;
2034	if ((c & 0xe0) == 0xc0) {
2035	vc->vc_utf_count = 1;
2036	vc->vc_utf_char = (c & 0x1f);
2037	} else if ((c & 0xf0) == 0xe0) {
2038	vc->vc_utf_count = 2;
2039	vc->vc_utf_char = (c & 0x0f);
2040	} else if ((c & 0xf8) == 0xf0) {
2041	vc->vc_utf_count = 3;
2042	vc->vc_utf_char = (c & 0x07);
2043	} else if ((c & 0xfc) == 0xf8) {
2044	vc->vc_utf_count = 4;
2045	vc->vc_utf_char = (c & 0x03);
2046	} else if ((c & 0xfe) == 0xfc) {
2047	vc->vc_utf_count = 5;
2048	vc->vc_utf_char = (c & 0x01);
2049	} else
2050	goto replacement_glyph;
2051	continue;	2080	continue;
2052	}	2081	}
		2082	/* Got a whole character */
		2083	c = vc->vc_utf_char;
		2084	/* Reject overlong sequences */
		2085	if (c <= utf8_length_changes[vc->vc_npar - 1] \|\|
		2086	c > utf8_length_changes[vc->vc_npar])
		2087	c = 0xfffd;
		2088	} else {
		2089	/* Unexpected continuation byte */
		2090	vc->vc_utf_count = 0;
		2091	c = 0xfffd;
		2092	}
2053	} else {	2093	} else {
2054	if (vc->vc_utf_count)	2094	/* Single ASCII byte or first byte of a sequence received */
2055	goto replacement_glyph;	2095	if (vc->vc_utf_count) {
2056	tc = c;	2096	/* Continuation byte expected */
		2097	rescan = 1;
		2098	vc->vc_utf_count = 0;
		2099	c = 0xfffd;
		2100	} else if (c > 0x7f) {
		2101	/* First byte of a multibyte sequence received */
		2102	vc->vc_npar = 0;
		2103	if ((c & 0xe0) == 0xc0) {
		2104	vc->vc_utf_count = 1;
		2105	vc->vc_utf_char = (c & 0x1f);
		2106	} else if ((c & 0xf0) == 0xe0) {
		2107	vc->vc_utf_count = 2;
		2108	vc->vc_utf_char = (c & 0x0f);
		2109	} else if ((c & 0xf8) == 0xf0) {
		2110	vc->vc_utf_count = 3;
		2111	vc->vc_utf_char = (c & 0x07);
		2112	} else if ((c & 0xfc) == 0xf8) {
		2113	vc->vc_utf_count = 4;
		2114	vc->vc_utf_char = (c & 0x03);
		2115	} else if ((c & 0xfe) == 0xfc) {
		2116	vc->vc_utf_count = 5;
		2117	vc->vc_utf_char = (c & 0x01);
		2118	} else {
		2119	/* 254 and 255 are invalid */
		2120	c = 0xfffd;
		2121	}
		2122	if (vc->vc_utf_count) {
		2123	/* Still need some bytes */
		2124	continue;
		2125	}
		2126	}
		2127	/* Nothing to do if an ASCII byte was received */
2057	}	2128	}
		2129	/* End of UTF-8 decoding. */
		2130	/* c is the received character, or U+FFFD for invalid sequences. */
		2131	/* Replace invalid Unicode code points with U+FFFD too */
		2132	if ((c >= 0xd800 && c <= 0xdfff) \|\| c == 0xfffe \|\| c == 0xffff)
		2133	c = 0xfffd;
		2134	tc = c;
2058	} else { /* no utf or alternate charset mode */	2135	} else { /* no utf or alternate charset mode */
2059	tc = vc->vc_translate[vc->vc_toggle_meta ? (c \| 0x80) : c];	2136	tc = vc->vc_translate[vc->vc_toggle_meta ? (c \| 0x80) : c];
2060	}	2137	}
2061		2138
2062	/* If the original code was a control character we	2139	/* If the original code was a control character we
@@ -2076,56 +2153,80 @@ rescan_last_byte:
2076	&& (c != 128+27);	2153	&& (c != 128+27);
2077		2154
2078	if (vc->vc_state == ESnormal && ok) {	2155	if (vc->vc_state == ESnormal && ok) {
		2156	if (vc->vc_utf && !vc->vc_disp_ctrl) {
		2157	if (is_double_width(c))
		2158	width = 2;
		2159	}
2079	/* Now try to find out how to display it */	2160	/* Now try to find out how to display it */
2080	tc = conv_uni_to_pc(vc, tc);	2161	tc = conv_uni_to_pc(vc, tc);
2081	if (tc & ~charmask) {	2162	if (tc & ~charmask) {
2082	if ( tc == -4 ) {	2163	if (tc == -1 \|\| tc == -2) {
2083	/* If we got -4 (not found) then see if we have	2164	continue; /* nothing to display */
2084	defined a replacement character (U+FFFD) */	2165	}
2085	replacement_glyph:	2166	/* Glyph not found */
2086	tc = conv_uni_to_pc(vc, 0xfffd);	2167	if (!(vc->vc_utf && !vc->vc_disp_ctrl) && !(c & ~charmask)) {
2087	if (!(tc & ~charmask))	2168	/* In legacy mode use the glyph we get by a 1:1 mapping.
2088	goto display_glyph;	2169	This would make absolutely no sense with Unicode in mind. */
2089	} else if ( tc != -3 )	2170	tc = c;
2090	continue; /* nothing to display */	2171	} else {
2091	/* no hash table or no replacement --	2172	/* Display U+FFFD. If it's not found, display an inverse question mark. */
2092	* hope for the best */	2173	tc = conv_uni_to_pc(vc, 0xfffd);
2093	if ( c & ~charmask )	2174	if (tc < 0) {
2094	tc = '?';	2175	inverse = 1;
2095	else	2176	tc = conv_uni_to_pc(vc, '?');
2096	tc = c;	2177	if (tc < 0) tc = '?';
		2178	}
		2179	}
2097	}	2180	}
2098		2181
2099	display_glyph:	2182	if (!inverse) {
2100	if (vc->vc_need_wrap \|\| vc->vc_decim)	2183	vc_attr = vc->vc_attr;
2101	FLUSH
2102	if (vc->vc_need_wrap) {
2103	cr(vc);
2104	lf(vc);
2105	}
2106	if (vc->vc_decim)
2107	insert_char(vc, 1);
2108	scr_writew(himask ?
2109	((vc->vc_attr << 8) & ~himask) + ((tc & 0x100) ? himask : 0) + (tc & 0xff) :
2110	(vc->vc_attr << 8) + tc,
2111	(u16 *) vc->vc_pos);
2112	if (DO_UPDATE(vc) && draw_x < 0) {
2113	draw_x = vc->vc_x;
2114	draw_from = vc->vc_pos;
2115	}
2116	if (vc->vc_x == vc->vc_cols - 1) {
2117	vc->vc_need_wrap = vc->vc_decawm;
2118	draw_to = vc->vc_pos + 2;
2119	} else {	2184	} else {
2120	vc->vc_x++;	2185	/* invert vc_attr */
2121	draw_to = (vc->vc_pos += 2);	2186	if (!vc->vc_can_do_color) {
		2187	vc_attr = (vc->vc_attr) ^ 0x08;
		2188	} else if (vc->vc_hi_font_mask == 0x100) {
		2189	vc_attr = ((vc->vc_attr) & 0x11) \| (((vc->vc_attr) & 0xe0) >> 4) \| (((vc->vc_attr) & 0x0e) << 4);
		2190	} else {
		2191	vc_attr = ((vc->vc_attr) & 0x88) \| (((vc->vc_attr) & 0x70) >> 4) \| (((vc->vc_attr) & 0x07) << 4);
		2192	}
2122	}	2193	}
2123	if (vc->vc_utf_count) {	2194
2124	if (vc->vc_npar) {	2195	while (1) {
2125	vc->vc_npar--;	2196	if (vc->vc_need_wrap \|\| vc->vc_decim)
2126	goto display_glyph;	2197	FLUSH
		2198	if (vc->vc_need_wrap) {
		2199	cr(vc);
		2200	lf(vc);
		2201	}
		2202	if (vc->vc_decim)
		2203	insert_char(vc, 1);
		2204	scr_writew(himask ?
		2205	((vc_attr << 8) & ~himask) + ((tc & 0x100) ? himask : 0) + (tc & 0xff) :
		2206	(vc_attr << 8) + tc,
		2207	(u16 *) vc->vc_pos);
		2208	if (DO_UPDATE(vc) && draw_x < 0) {
		2209	draw_x = vc->vc_x;
		2210	draw_from = vc->vc_pos;
		2211	}
		2212	if (vc->vc_x == vc->vc_cols - 1) {
		2213	vc->vc_need_wrap = vc->vc_decawm;
		2214	draw_to = vc->vc_pos + 2;
		2215	} else {
		2216	vc->vc_x++;
		2217	draw_to = (vc->vc_pos += 2);
2127	}	2218	}
2128	vc->vc_utf_count = 0;	2219
		2220	if (!--width) break;
		2221
		2222	tc = conv_uni_to_pc(vc, ' '); /* A space is printed in the second column */
		2223	if (tc < 0) tc = ' ';
		2224	}
		2225
		2226	if (rescan) {
		2227	rescan = 0;
		2228	inverse = 0;
		2229	width = 1;
2129	c = orig;	2230	c = orig;
2130	goto rescan_last_byte;	2231	goto rescan_last_byte;
2131	}	2232	}