aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorEgmont Koblinger <egmont@uhulinux.hu>2007-05-08 03:30:37 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-05-08 14:15:12 -0400
commit2f1a2ccb9c0de632ab07193becf5f7121794f6ae (patch)
tree8fffd5aa34634ad6809e05e0dcbeebec5d039d3f /drivers
parente659ba4a0d2d471c0d73590f78e1a1b5a1eede48 (diff)
console UTF-8 fixes
The UTF-8 part of the vt driver suffers from the following issues which are addressed in my patch: 1) If there's no glyph found for a particular valid UTF-8 character, we try to display U+FFFD. However if this one is not found either, here's what the current kernel does: - First, if the Unicode value is less than the number of glyphs, use the glyph directly from that position of the glyph table. While it may be a good idea in the 8-bit world, it makes absolutely no sense with Unicode in mind. For example, if a Latin-2 font is loaded and an application prints U+00FB ("u with circumflex", not present in Latin-2) then as a fallback solution the glyph from the 0xFB position of the Latin-2 fontset (which is a "u with double accent" - a different character) is displayed. - Second, if this fallback fails too, a simple ASCII question mark is printed, which is visually indistinguishable from a real question mark. I changed the code to skip the first step (except if in non-UTF-8 mode), and changed the second step to print the question mark with inverse color attributes, so it is visually clear that it's not a real question mark, and it more closely resembles the common glyph of U+FFFD. 2) The UTF-8 decoder is buggy in many ways: - Lone continuation bytes (section 3.1 of Markus Kuhn's UTF-8 stress test) are not caught, they are displayed as some "random" (taken directly from the font table, see above) glyphs instead of the replacement character. - Incomplete sequences (sections 3.2 and 3.3 of the stress test) emit no replacement character, but rather cause the subsequent valid character to be displayed multiple times(!). - The decoder is not safe: overlong sequences are not caught currently, they are displayed as if these were valid representations. This may even have security impacts. - The decoder does not handle D800..DFFF and FFFE..FFFF specially, it just emits these code points and lets them be looked up in the glyph table. 
Since these are invalid code points, I replace them with U+FFFD and hence give no chance for them to be looked up in the glyph table. (Assuming no font ships glyphs for these code points, this change is not visible to the users since the glyph shown will be the same.) With my fixes to the decoder it now behaves exactly as Markus Kuhn's stress test recommends. 3) It has no concept of double-width (CJK) characters. It's way beyond the scope of my patch to try to display them, but at least I think it's important for the cursor to jump two positions when printing such characters, since this is what applications (such as text editors) expect. Currently the cursor only jumps one position, and hence applications suffer from displaying and refreshing problems, and editing some English letters that are preceded by some CJK characters in the same line is a nightmare. With my patch an additional space is inserted after the CJK character has been printed (which usually means a replacement symbol of course). (If U+FFFD isn't available and hence an inverse question mark is displayed in the first cell, I keep the inverted state for the space in the 2nd column so it's quite easy to see that they are tied together.) 4) There is a small built-in table of zero-width spaces that are not to be printed but silently skipped. U+200A is included there, but it's not a zero-width character, so I remove it from there. Signed-off-by: Egmont Koblinger <egmont@uhulinux.hu> Cc: Jan Engelhardt <jengelh@linux01.gwdg.de> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "Antonino A. Daplas" <adaplas@pol.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/char/consolemap.c6
-rw-r--r--drivers/char/vt.c257
2 files changed, 182 insertions, 81 deletions
diff --git a/drivers/char/consolemap.c b/drivers/char/consolemap.c
index b99b7561260d..fd40b959afdd 100644
--- a/drivers/char/consolemap.c
+++ b/drivers/char/consolemap.c
@@ -626,10 +626,10 @@ conv_uni_to_pc(struct vc_data *conp, long ucs)
626 626
627 /* Only 16-bit codes supported at this time */ 627 /* Only 16-bit codes supported at this time */
628 if (ucs > 0xffff) 628 if (ucs > 0xffff)
629 ucs = 0xfffd; /* U+FFFD: REPLACEMENT CHARACTER */ 629 return -4; /* Not found */
630 else if (ucs < 0x20 || ucs >= 0xfffe) 630 else if (ucs < 0x20)
631 return -1; /* Not a printable character */ 631 return -1; /* Not a printable character */
632 else if (ucs == 0xfeff || (ucs >= 0x200a && ucs <= 0x200f)) 632 else if (ucs == 0xfeff || (ucs >= 0x200b && ucs <= 0x200f))
633 return -2; /* Zero-width space */ 633 return -2; /* Zero-width space */
634 /* 634 /*
635 * UNI_DIRECT_BASE indicates the start of the region in the User Zone 635 * UNI_DIRECT_BASE indicates the start of the region in the User Zone
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 1bbb45b937fd..afd00464184e 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -1932,6 +1932,46 @@ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c)
1932char con_buf[CON_BUF_SIZE]; 1932char con_buf[CON_BUF_SIZE];
1933DECLARE_MUTEX(con_buf_sem); 1933DECLARE_MUTEX(con_buf_sem);
1934 1934
/*
 * Double-width character detection for the console.
 *
 * is_double_width() is based on the wcwidth() implementation by
 * Markus Kuhn -- 2003-05-20 (Unicode 4.0)
 * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
 */

/* An inclusive range of code points: first <= ucs <= last. */
struct interval {
	uint32_t first;
	uint32_t last;
};

/*
 * bisearch - binary search for a code point in a table of intervals.
 * @ucs:   code point to look up
 * @table: intervals sorted in ascending order and not overlapping
 * @max:   index of the LAST entry in @table (entry count - 1), not the count
 *
 * Returns 1 if @ucs lies inside one of the intervals, 0 otherwise.
 * A negative @max denotes an empty table and always yields 0.
 */
static int bisearch(uint32_t ucs, const struct interval *table, int max)
{
	int min = 0;
	int mid;

	/* Empty table: without this guard table[max] would be read out of bounds. */
	if (max < 0)
		return 0;

	/* Quick rejection of anything outside the overall span of the table. */
	if (ucs < table[0].first || ucs > table[max].last)
		return 0;

	while (max >= min) {
		/* Written this way so the midpoint cannot overflow int. */
		mid = min + (max - min) / 2;
		if (ucs > table[mid].last)
			min = mid + 1;
		else if (ucs < table[mid].first)
			max = mid - 1;
		else
			return 1;
	}
	return 0;
}

/*
 * is_double_width - tell whether a code point occupies two character cells.
 * @ucs: Unicode code point
 *
 * Returns 1 if @ucs falls in one of the ranges listed below, 0 otherwise.
 * The table must stay sorted by 'first' because bisearch() relies on it.
 */
static int is_double_width(uint32_t ucs)
{
	static const struct interval double_width[] = {
		{ 0x1100, 0x115F }, { 0x2329, 0x232A }, { 0x2E80, 0x303E },
		{ 0x3040, 0xA4CF }, { 0xAC00, 0xD7A3 }, { 0xF900, 0xFAFF },
		{ 0xFE30, 0xFE6F }, { 0xFF00, 0xFF60 }, { 0xFFE0, 0xFFE6 },
		{ 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD }
	};
	/* bisearch() expects the index of the last entry, hence the -1. */
	const int last = (int)(sizeof(double_width) / sizeof(double_width[0])) - 1;

	return bisearch(ucs, double_width, last);
}
1974
1935/* acquires console_sem */ 1975/* acquires console_sem */
1936static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int count) 1976static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int count)
1937{ 1977{
@@ -1948,6 +1988,10 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
1948 unsigned int currcons; 1988 unsigned int currcons;
1949 unsigned long draw_from = 0, draw_to = 0; 1989 unsigned long draw_from = 0, draw_to = 0;
1950 struct vc_data *vc; 1990 struct vc_data *vc;
1991 unsigned char vc_attr;
1992 uint8_t rescan;
1993 uint8_t inverse;
1994 uint8_t width;
1951 u16 himask, charmask; 1995 u16 himask, charmask;
1952 const unsigned char *orig_buf = NULL; 1996 const unsigned char *orig_buf = NULL;
1953 int orig_count; 1997 int orig_count;
@@ -2010,53 +2054,86 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
2010 buf++; 2054 buf++;
2011 n++; 2055 n++;
2012 count--; 2056 count--;
2057 rescan = 0;
2058 inverse = 0;
2059 width = 1;
2013 2060
2014 /* Do no translation at all in control states */ 2061 /* Do no translation at all in control states */
2015 if (vc->vc_state != ESnormal) { 2062 if (vc->vc_state != ESnormal) {
2016 tc = c; 2063 tc = c;
2017 } else if (vc->vc_utf && !vc->vc_disp_ctrl) { 2064 } else if (vc->vc_utf && !vc->vc_disp_ctrl) {
2018 /* Combine UTF-8 into Unicode */ 2065 /* Combine UTF-8 into Unicode in vc_utf_char.
2019 /* Malformed sequences as sequences of replacement glyphs */ 2066 * vc_utf_count is the number of continuation bytes still
2067 * expected to arrive.
2068 * vc_npar is the number of continuation bytes arrived so
2069 * far
2070 */
2020rescan_last_byte: 2071rescan_last_byte:
2021 if(c > 0x7f) { 2072 if ((c & 0xc0) == 0x80) {
2073 /* Continuation byte received */
2074 static const uint32_t utf8_length_changes[] = { 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff };
2022 if (vc->vc_utf_count) { 2075 if (vc->vc_utf_count) {
2023 if ((c & 0xc0) == 0x80) { 2076 vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f);
2024 vc->vc_utf_char = (vc->vc_utf_char << 6) | (c & 0x3f); 2077 vc->vc_npar++;
2025 if (--vc->vc_utf_count) { 2078 if (--vc->vc_utf_count) {
2026 vc->vc_npar++; 2079 /* Still need some bytes */
2027 continue;
2028 }
2029 tc = c = vc->vc_utf_char;
2030 } else
2031 goto replacement_glyph;
2032 } else {
2033 vc->vc_npar = 0;
2034 if ((c & 0xe0) == 0xc0) {
2035 vc->vc_utf_count = 1;
2036 vc->vc_utf_char = (c & 0x1f);
2037 } else if ((c & 0xf0) == 0xe0) {
2038 vc->vc_utf_count = 2;
2039 vc->vc_utf_char = (c & 0x0f);
2040 } else if ((c & 0xf8) == 0xf0) {
2041 vc->vc_utf_count = 3;
2042 vc->vc_utf_char = (c & 0x07);
2043 } else if ((c & 0xfc) == 0xf8) {
2044 vc->vc_utf_count = 4;
2045 vc->vc_utf_char = (c & 0x03);
2046 } else if ((c & 0xfe) == 0xfc) {
2047 vc->vc_utf_count = 5;
2048 vc->vc_utf_char = (c & 0x01);
2049 } else
2050 goto replacement_glyph;
2051 continue; 2080 continue;
2052 } 2081 }
2082 /* Got a whole character */
2083 c = vc->vc_utf_char;
2084 /* Reject overlong sequences */
2085 if (c <= utf8_length_changes[vc->vc_npar - 1] ||
2086 c > utf8_length_changes[vc->vc_npar])
2087 c = 0xfffd;
2088 } else {
2089 /* Unexpected continuation byte */
2090 vc->vc_utf_count = 0;
2091 c = 0xfffd;
2092 }
2053 } else { 2093 } else {
2054 if (vc->vc_utf_count) 2094 /* Single ASCII byte or first byte of a sequence received */
2055 goto replacement_glyph; 2095 if (vc->vc_utf_count) {
2056 tc = c; 2096 /* Continuation byte expected */
2097 rescan = 1;
2098 vc->vc_utf_count = 0;
2099 c = 0xfffd;
2100 } else if (c > 0x7f) {
2101 /* First byte of a multibyte sequence received */
2102 vc->vc_npar = 0;
2103 if ((c & 0xe0) == 0xc0) {
2104 vc->vc_utf_count = 1;
2105 vc->vc_utf_char = (c & 0x1f);
2106 } else if ((c & 0xf0) == 0xe0) {
2107 vc->vc_utf_count = 2;
2108 vc->vc_utf_char = (c & 0x0f);
2109 } else if ((c & 0xf8) == 0xf0) {
2110 vc->vc_utf_count = 3;
2111 vc->vc_utf_char = (c & 0x07);
2112 } else if ((c & 0xfc) == 0xf8) {
2113 vc->vc_utf_count = 4;
2114 vc->vc_utf_char = (c & 0x03);
2115 } else if ((c & 0xfe) == 0xfc) {
2116 vc->vc_utf_count = 5;
2117 vc->vc_utf_char = (c & 0x01);
2118 } else {
2119 /* 254 and 255 are invalid */
2120 c = 0xfffd;
2121 }
2122 if (vc->vc_utf_count) {
2123 /* Still need some bytes */
2124 continue;
2125 }
2126 }
2127 /* Nothing to do if an ASCII byte was received */
2057 } 2128 }
2129 /* End of UTF-8 decoding. */
2130 /* c is the received character, or U+FFFD for invalid sequences. */
2131 /* Replace invalid Unicode code points with U+FFFD too */
2132 if ((c >= 0xd800 && c <= 0xdfff) || c == 0xfffe || c == 0xffff)
2133 c = 0xfffd;
2134 tc = c;
2058 } else { /* no utf or alternate charset mode */ 2135 } else { /* no utf or alternate charset mode */
2059 tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c]; 2136 tc = vc->vc_translate[vc->vc_toggle_meta ? (c | 0x80) : c];
2060 } 2137 }
2061 2138
2062 /* If the original code was a control character we 2139 /* If the original code was a control character we
@@ -2076,56 +2153,80 @@ rescan_last_byte:
2076 && (c != 128+27); 2153 && (c != 128+27);
2077 2154
2078 if (vc->vc_state == ESnormal && ok) { 2155 if (vc->vc_state == ESnormal && ok) {
2156 if (vc->vc_utf && !vc->vc_disp_ctrl) {
2157 if (is_double_width(c))
2158 width = 2;
2159 }
2079 /* Now try to find out how to display it */ 2160 /* Now try to find out how to display it */
2080 tc = conv_uni_to_pc(vc, tc); 2161 tc = conv_uni_to_pc(vc, tc);
2081 if (tc & ~charmask) { 2162 if (tc & ~charmask) {
2082 if ( tc == -4 ) { 2163 if (tc == -1 || tc == -2) {
2083 /* If we got -4 (not found) then see if we have 2164 continue; /* nothing to display */
2084 defined a replacement character (U+FFFD) */ 2165 }
2085replacement_glyph: 2166 /* Glyph not found */
2086 tc = conv_uni_to_pc(vc, 0xfffd); 2167 if (!(vc->vc_utf && !vc->vc_disp_ctrl) && !(c & ~charmask)) {
2087 if (!(tc & ~charmask)) 2168 /* In legacy mode use the glyph we get by a 1:1 mapping.
2088 goto display_glyph; 2169 This would make absolutely no sense with Unicode in mind. */
2089 } else if ( tc != -3 ) 2170 tc = c;
2090 continue; /* nothing to display */ 2171 } else {
2091 /* no hash table or no replacement -- 2172 /* Display U+FFFD. If it's not found, display an inverse question mark. */
2092 * hope for the best */ 2173 tc = conv_uni_to_pc(vc, 0xfffd);
2093 if ( c & ~charmask ) 2174 if (tc < 0) {
2094 tc = '?'; 2175 inverse = 1;
2095 else 2176 tc = conv_uni_to_pc(vc, '?');
2096 tc = c; 2177 if (tc < 0) tc = '?';
2178 }
2179 }
2097 } 2180 }
2098 2181
2099display_glyph: 2182 if (!inverse) {
2100 if (vc->vc_need_wrap || vc->vc_decim) 2183 vc_attr = vc->vc_attr;
2101 FLUSH
2102 if (vc->vc_need_wrap) {
2103 cr(vc);
2104 lf(vc);
2105 }
2106 if (vc->vc_decim)
2107 insert_char(vc, 1);
2108 scr_writew(himask ?
2109 ((vc->vc_attr << 8) & ~himask) + ((tc & 0x100) ? himask : 0) + (tc & 0xff) :
2110 (vc->vc_attr << 8) + tc,
2111 (u16 *) vc->vc_pos);
2112 if (DO_UPDATE(vc) && draw_x < 0) {
2113 draw_x = vc->vc_x;
2114 draw_from = vc->vc_pos;
2115 }
2116 if (vc->vc_x == vc->vc_cols - 1) {
2117 vc->vc_need_wrap = vc->vc_decawm;
2118 draw_to = vc->vc_pos + 2;
2119 } else { 2184 } else {
2120 vc->vc_x++; 2185 /* invert vc_attr */
2121 draw_to = (vc->vc_pos += 2); 2186 if (!vc->vc_can_do_color) {
2187 vc_attr = (vc->vc_attr) ^ 0x08;
2188 } else if (vc->vc_hi_font_mask == 0x100) {
2189 vc_attr = ((vc->vc_attr) & 0x11) | (((vc->vc_attr) & 0xe0) >> 4) | (((vc->vc_attr) & 0x0e) << 4);
2190 } else {
2191 vc_attr = ((vc->vc_attr) & 0x88) | (((vc->vc_attr) & 0x70) >> 4) | (((vc->vc_attr) & 0x07) << 4);
2192 }
2122 } 2193 }
2123 if (vc->vc_utf_count) { 2194
2124 if (vc->vc_npar) { 2195 while (1) {
2125 vc->vc_npar--; 2196 if (vc->vc_need_wrap || vc->vc_decim)
2126 goto display_glyph; 2197 FLUSH
2198 if (vc->vc_need_wrap) {
2199 cr(vc);
2200 lf(vc);
2201 }
2202 if (vc->vc_decim)
2203 insert_char(vc, 1);
2204 scr_writew(himask ?
2205 ((vc_attr << 8) & ~himask) + ((tc & 0x100) ? himask : 0) + (tc & 0xff) :
2206 (vc_attr << 8) + tc,
2207 (u16 *) vc->vc_pos);
2208 if (DO_UPDATE(vc) && draw_x < 0) {
2209 draw_x = vc->vc_x;
2210 draw_from = vc->vc_pos;
2211 }
2212 if (vc->vc_x == vc->vc_cols - 1) {
2213 vc->vc_need_wrap = vc->vc_decawm;
2214 draw_to = vc->vc_pos + 2;
2215 } else {
2216 vc->vc_x++;
2217 draw_to = (vc->vc_pos += 2);
2127 } 2218 }
2128 vc->vc_utf_count = 0; 2219
2220 if (!--width) break;
2221
2222 tc = conv_uni_to_pc(vc, ' '); /* A space is printed in the second column */
2223 if (tc < 0) tc = ' ';
2224 }
2225
2226 if (rescan) {
2227 rescan = 0;
2228 inverse = 0;
2229 width = 1;
2129 c = orig; 2230 c = orig;
2130 goto rescan_last_byte; 2231 goto rescan_last_byte;
2131 } 2232 }