[PATCH v2 09/13] vt: support Unicode recomposition

From: Nicolas Pitre
Date: Tue Apr 15 2025 - 15:23:52 EST


From: Nicolas Pitre <npitre@xxxxxxxxxxxx>

Try replacing any decomposed Unicode sequence with the corresponding
recomposed code point. Code point to glyph correspondence works best
after recomposition, and this applies mostly to single-width code points;
therefore we can't preserve them in their decomposed form anyway.

Signed-off-by: Nicolas Pitre <npitre@xxxxxxxxxxxx>
---
drivers/tty/vt/ucs.c | 62 ++++++++++++++++++++++++++++++++++++++
drivers/tty/vt/vt.c | 14 +++++++--
include/linux/consolemap.h | 6 ++++
3 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/drivers/tty/vt/ucs.c b/drivers/tty/vt/ucs.c
index 5e71aa3896..07b2bd1714 100644
--- a/drivers/tty/vt/ucs.c
+++ b/drivers/tty/vt/ucs.c
@@ -56,3 +56,65 @@ bool ucs_is_double_width(u32 cp)
return cp_in_range(cp, ucs_double_width_ranges,
ARRAY_SIZE(ucs_double_width_ranges));
}
+
+/*
+ * One recomposition rule: a base + combining mark pair and the single code
+ * point replacing it. u16 saves space since all values are within BMP range.
+ */
+struct ucs_recomposition {
+ u16 base; /* base character */
+ u16 mark; /* combining mark */
+ u16 recomposed; /* corresponding recomposed character */
+};
+
+#include "ucs_recompose_table.h"
+
+struct compare_key {
+ u16 base; /* base character being looked up */
+ u16 mark; /* combining mark being looked up */
+};
+
+/*
+ * Comparison callback for bsearch() over the recomposition table.
+ * @key is a struct compare_key and @element a struct ucs_recomposition;
+ * entries order by base first, with the mark breaking ties between bases.
+ * Returns -1, 0 or 1.
+ */
+static int recomposition_cmp(const void *key, const void *element)
+{
+	const struct compare_key *wanted = key;
+	const struct ucs_recomposition *rule = element;
+
+	if (wanted->base != rule->base)
+		return wanted->base < rule->base ? -1 : 1;
+
+	if (wanted->mark != rule->mark)
+		return wanted->mark < rule->mark ? -1 : 1;
+
+	/* Same base and same mark: this is the entry being looked up. */
+	return 0;
+}
+
+/**
+ * ucs_recompose() - Attempt to recompose two Unicode characters into one.
+ * @base: Base Unicode code point (UCS-4)
+ * @mark: Combining mark Unicode code point (UCS-4)
+ *
+ * Return: Recomposed Unicode code point, or 0 if no recomposition is possible
+ */
+u32 ucs_recompose(u32 base, u32 mark)
+{
+	/* Inclusive [MIN, MAX] test; in_range() wants a length, not a maximum. */
+	if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE ||
+	    mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK)
+		return 0;
+
+	struct compare_key key = { base, mark };
+	const struct ucs_recomposition *result =
+		__inline_bsearch(&key, ucs_recomposition_table,
+				 ARRAY_SIZE(ucs_recomposition_table),
+				 sizeof(*ucs_recomposition_table),
+				 recomposition_cmp);
+
+	return result ? result->recomposed : 0;
+}
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index a989feffad..76554c2040 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -2925,9 +2925,9 @@ static void vc_con_rewind(struct vc_data *vc)

#define UCS_VS16 0xfe0f /* Variation Selector 16 */

-static int vc_process_ucs(struct vc_data *vc, int c, int *tc)
+static int vc_process_ucs(struct vc_data *vc, int *c, int *tc)
{
- u32 prev_c, curr_c = c;
+ u32 prev_c, curr_c = *c;

if (ucs_is_double_width(curr_c))
return 2;
@@ -2964,6 +2964,14 @@ static int vc_process_ucs(struct vc_data *vc, int c, int *tc)
return 1;
}

+ /* try recomposition */
+ prev_c = ucs_recompose(prev_c, curr_c);
+ if (prev_c != 0) {
+ vc_con_rewind(vc);
+ *tc = *c = prev_c;
+ return 1;
+ }
+
/* Otherwise zero-width code points are ignored. */
return 0;
}
@@ -2978,7 +2986,7 @@ static int vc_con_write_normal(struct vc_data *vc, int tc, int c,
bool inverse = false;

if (vc->vc_utf && !vc->vc_disp_ctrl) {
- width = vc_process_ucs(vc, c, &tc);
+ width = vc_process_ucs(vc, &c, &tc);
if (!width)
goto out;
}
diff --git a/include/linux/consolemap.h b/include/linux/consolemap.h
index b3a9118666..8167494229 100644
--- a/include/linux/consolemap.h
+++ b/include/linux/consolemap.h
@@ -30,6 +30,7 @@ int conv_uni_to_8bit(u32 uni);
void console_map_init(void);
bool ucs_is_double_width(uint32_t cp);
bool ucs_is_zero_width(uint32_t cp);
+u32 ucs_recompose(u32 base, u32 mark);
#else
static inline u16 inverse_translate(const struct vc_data *conp, u16 glyph,
bool use_unicode)
@@ -69,6 +70,11 @@ static inline bool ucs_is_zero_width(uint32_t cp)
{
return false;
}
+
+static inline u32 ucs_recompose(u32 base, u32 mark)
+{
+ return 0; /* no recomposition without CONFIG_CONSOLE_TRANSLATIONS */
+}
#endif /* CONFIG_CONSOLE_TRANSLATIONS */

#endif /* __LINUX_CONSOLEMAP_H__ */
--
2.49.0