[PATCHv2] speakup: Turn i18n files utf-8

From: Samuel Thibault
Date: Wed Mar 27 2024 - 07:51:14 EST


i18n currently assume latin1 encoding, which is not enough for most
languages.

This separates out the utf-8 processing of /dev/synthu, and uses it for
a new synth_writeu, which we make synth_printf now use. This has the
effect of making all the i18 messages processed in utf-8.

Signed-off-by: Samuel Thibault <samuel.thibault@xxxxxxxxxxxx>
---
Change since v1: Refresh patch on top of linus tree

drivers/accessibility/speakup/devsynth.c | 59 ++++-----------
drivers/accessibility/speakup/speakup.h | 2 +
drivers/accessibility/speakup/synth.c | 92 ++++++++++++++++++++++--
3 files changed, 102 insertions(+), 51 deletions(-)

diff --git a/drivers/accessibility/speakup/devsynth.c b/drivers/accessibility/speakup/devsynth.c
index cb7e1114e8eb..e3d909bd0480 100644
--- a/drivers/accessibility/speakup/devsynth.c
+++ b/drivers/accessibility/speakup/devsynth.c
@@ -39,13 +39,13 @@ static ssize_t speakup_file_write(struct file *fp, const char __user *buffer,
static ssize_t speakup_file_writeu(struct file *fp, const char __user *buffer,
size_t nbytes, loff_t *ppos)
{
- size_t count = nbytes, want;
+ size_t count = nbytes, consumed, want;
const char __user *ptr = buffer;
size_t bytes;
unsigned long flags;
unsigned char buf[256];
u16 ubuf[256];
- size_t in, in2, out;
+ size_t in, out;

if (!synth)
return -ENODEV;
@@ -58,57 +58,24 @@ static ssize_t speakup_file_writeu(struct file *fp, const char __user *buffer,
return -EFAULT;

/* Convert to u16 */
- for (in = 0, out = 0; in < bytes; in++) {
- unsigned char c = buf[in];
- int nbytes = 8 - fls(c ^ 0xff);
- u32 value;
-
- switch (nbytes) {
- case 8: /* 0xff */
- case 7: /* 0xfe */
- case 1: /* 0x80 */
- /* Invalid, drop */
- goto drop;
-
- case 0:
- /* ASCII, copy */
- ubuf[out++] = c;
- continue;
+ for (in = 0, out = 0; in < bytes; in += consumed) {
+ s32 value;

- default:
- /* 2..6-byte UTF-8 */
+ value = synth_utf8_get(buf + in, bytes - in, &consumed, &want);
+ if (value == -1) {
+ /* Invalid or incomplete */

- if (bytes - in < nbytes) {
+ if (want > bytes - in)
/* We don't have it all yet, stop here
* and wait for the rest
*/
bytes = in;
- want = nbytes;
- continue;
- }
-
- /* First byte */
- value = c & ((1u << (7 - nbytes)) - 1);
-
- /* Other bytes */
- for (in2 = 2; in2 <= nbytes; in2++) {
- c = buf[in + 1];
- if ((c & 0xc0) != 0x80) {
- /* Invalid, drop the head */
- want = 1;
- goto drop;
- }
- value = (value << 6) | (c & 0x3f);
- in++;
- }
-
- if (value < 0x10000)
- ubuf[out++] = value;
- want = 1;
- break;
+
+ continue;
}
-drop:
- /* empty statement */;
+
+ if (value < 0x10000)
+ ubuf[out++] = value;
}

count -= bytes;
diff --git a/drivers/accessibility/speakup/speakup.h b/drivers/accessibility/speakup/speakup.h
index 364fde99749e..54f1226ea061 100644
--- a/drivers/accessibility/speakup/speakup.h
+++ b/drivers/accessibility/speakup/speakup.h
@@ -76,7 +76,9 @@ int speakup_paste_selection(struct tty_struct *tty);
void speakup_cancel_paste(void);
void speakup_register_devsynth(void);
void speakup_unregister_devsynth(void);
+s32 synth_utf8_get(const char *buf, size_t count, size_t *consumed, size_t *want);
void synth_write(const char *buf, size_t count);
+void synth_writeu(const char *buf, size_t count);
int synth_supports_indexing(void);

extern struct vc_data *spk_sel_cons;
diff --git a/drivers/accessibility/speakup/synth.c b/drivers/accessibility/speakup/synth.c
index 45f906103133..85062e605d79 100644
--- a/drivers/accessibility/speakup/synth.c
+++ b/drivers/accessibility/speakup/synth.c
@@ -217,10 +217,95 @@ void synth_write(const char *_buf, size_t count)
synth_start();
}

+/* Consume one utf-8 character from buf (that contains up to count bytes),
+ * returns the unicode codepoint if valid, -1 otherwise.
+ * In all cases, returns the number of consumed bytes in *consumed,
+ * and the minimum number of bytes that would be needed for the next character
+ * in *want.
+ */
+s32 synth_utf8_get(const char *buf, size_t count, size_t *consumed, size_t *want)
+{
+ unsigned char c = buf[0];
+ int nbytes = 8 - fls(c ^ 0xff);
+ u32 value;
+ size_t i;
+
+ switch (nbytes) {
+ case 8: /* 0xff */
+ case 7: /* 0xfe */
+ case 1: /* 0x80 */
+ /* Invalid, drop */
+ *consumed = 1;
+ *want = 1;
+ return -1;
+
+ case 0:
+ /* ASCII, take as such */
+ *consumed = 1;
+ *want = 1;
+ return c;
+
+ default:
+ /* 2..6-byte UTF-8 */
+
+ if (count < nbytes) {
+ /* We don't have it all */
+ *consumed = 0;
+ *want = nbytes;
+ return -1;
+ }
+
+ /* First byte */
+ value = c & ((1u << (7 - nbytes)) - 1);
+
+ /* Other bytes */
+ for (i = 1; i < nbytes; i++) {
+ c = buf[i];
+ if ((c & 0xc0) != 0x80) {
+ /* Invalid, drop the head */
+ *consumed = i;
+ *want = 1;
+ return -1;
+ }
+ value = (value << 6) | (c & 0x3f);
+ }
+
+ *consumed = nbytes;
+ *want = 1;
+ return value;
+ }
+}
+
+void synth_writeu(const char *buf, size_t count)
+{
+ size_t i, consumed, want;
+
+ /* Convert to u16 */
+ for (i = 0; i < count; i++) {
+ s32 value;
+
+ value = synth_utf8_get(buf + i, count - i, &consumed, &want);
+ if (value == -1) {
+ /* Invalid or incomplete */
+
+ if (want > count - i)
+ /* We don't have it all, stop */
+ count = i;
+
+ continue;
+ }
+
+ if (value < 0x10000)
+ synth_buffer_add(value);
+ }
+
+ synth_start();
+}
+
void synth_printf(const char *fmt, ...)
{
va_list args;
- unsigned char buf[160], *p;
+ unsigned char buf[160];
int r;

va_start(args, fmt);
@@ -229,10 +314,7 @@ void synth_printf(const char *fmt, ...)
if (r > sizeof(buf) - 1)
r = sizeof(buf) - 1;

- p = buf;
- while (r--)
- synth_buffer_add(*p++);
- synth_start();
+ synth_writeu(buf, r);
}
EXPORT_SYMBOL_GPL(synth_printf);

--
2.43.0