[RFC PATCH 18/20] cifs: Do not use broken utf8 NLS table for iocharset=utf8 mount option

From: Pali Rohár
Date: Sun Aug 08 2021 - 12:25:45 EST


NLS table for utf8 is broken and cannot be fixed.

So instead of broken utf8 nls functions char2uni() and uni2char() use
functions utf8s_to_utf16s() and utf16s_to_utf8s() which implements correct
conversion between UTF-16 and UTF-8.

When iochatset=utf8 is used then set ctx->iocharset to NULL and use it for
distinguish between the fact if NLS table or native UTF-8 functions should
be used.

Signed-off-by: Pali Rohár <pali@xxxxxxxxxx>
---
fs/cifs/cifs_unicode.c | 128 +++++++++++++++++++++++++++--------------
fs/cifs/cifs_unicode.h | 2 +-
fs/cifs/cifsfs.c | 2 +
fs/cifs/connect.c | 8 ++-
fs/cifs/dir.c | 28 +++++++--
fs/cifs/winucase.c | 14 +++--
6 files changed, 124 insertions(+), 58 deletions(-)

diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 9bd03a231032..b0f7f78da7c2 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -131,20 +131,17 @@ cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
convert_sfu_char(src_char, target))
return len;

- /* if character not one of seven in special remap set */
- len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
- if (len <= 0)
- goto surrogate_pair;
-
- return len;
+ if (cp) {
+ /* if character not one of seven in special remap set */
+ len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
+ if (len <= 0)
+ goto unknown;
+ } else {
+ len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
+ if (len <= 0)
+ goto unknown;
+ }

-surrogate_pair:
- /* convert SURROGATE_PAIR and IVS */
- if (strcmp(cp->charset, "utf8"))
- goto unknown;
- len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
- if (len <= 0)
- goto unknown;
return len;

unknown:
@@ -240,6 +237,37 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
return outlen;
}

+static int cifs_utf8s_to_utf16s(const char *s, int inlen, __le16 *pwcs)
+{
+ __le16 *op;
+ int size;
+ unicode_t u;
+
+ op = pwcs;
+ while (inlen > 0 && *s) {
+ if (*s & 0x80) {
+ size = utf8_to_utf32(s, inlen, &u);
+ if (size <= 0) {
+ u = 0x003f; /* A question mark */
+ size = 1;
+ }
+ s += size;
+ inlen -= size;
+ if (u >= 0x10000) {
+ u -= 0x10000;
+ *op++ = __cpu_to_le16(0xd800 | ((u >> 10) & 0x03ff));
+ *op++ = __cpu_to_le16(0xdc00 | (u & 0x03ff));
+ } else {
+ *op++ = __cpu_to_le16(u);
+ }
+ } else {
+ *op++ = __cpu_to_le16(*s++);
+ inlen--;
+ }
+ }
+ return op - pwcs;
+}
+
/*
* NAME: cifs_strtoUTF16()
*
@@ -255,24 +283,14 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
wchar_t wchar_to; /* needed to quiet sparse */

/* special case for utf8 to handle no plane0 chars */
- if (!strcmp(codepage->charset, "utf8")) {
+ if (!codepage) {
/*
* convert utf8 -> utf16, we assume we have enough space
* as caller should have assumed conversion does not overflow
- * in destination len is length in wchar_t units (16bits)
- */
- i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
- (wchar_t *) to, len);
-
- /* if success terminate and exit */
- if (i >= 0)
- goto success;
- /*
- * if fails fall back to UCS encoding as this
- * function should not return negative values
- * currently can fail only if source contains
- * invalid encoded characters
+ * in destination len is length in __le16 units
*/
+ i = cifs_utf8s_to_utf16s(from, len, to);
+ goto success;
}

for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
@@ -508,25 +526,29 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
* as they use backslash as separator.
*/
if (dst_char == 0) {
- charlen = cp->char2uni(source + i, srclen - i, &tmp);
- dst_char = cpu_to_le16(tmp);
-
- /*
- * if no match, use question mark, which at least in
- * some cases serves as wild card
- */
- if (charlen > 0)
- goto ctoUTF16;
-
- /* convert SURROGATE_PAIR */
- if (strcmp(cp->charset, "utf8") || !wchar_to)
- goto unknown;
- if (*(source + i) & 0x80) {
- charlen = utf8_to_utf32(source + i, 6, &u);
- if (charlen < 0)
+ if (cp) {
+ charlen = cp->char2uni(source + i, srclen - i, &tmp);
+ dst_char = cpu_to_le16(tmp);
+
+ /*
+ * if no match, use question mark, which at least in
+ * some cases serves as wild card
+ */
+ if (charlen > 0)
+ goto ctoUTF16;
+ else
goto unknown;
- } else
+ }
+
+ /* UTF-8 to UTF-16 conversion */
+
+ if (!wchar_to)
goto unknown;
+
+ charlen = utf8_to_utf32(source + i, 6, &u);
+ if (charlen < 0)
+ goto unknown;
+
ret = utf8s_to_utf16s(source + i, charlen,
UTF16_LITTLE_ENDIAN,
wchar_to, 6);
@@ -595,8 +617,26 @@ cifs_local_to_utf16_bytes(const char *from, int len,
{
int charlen;
int i;
+ int outlen;
+ unicode_t u_to;
wchar_t wchar_to;

+ if (!codepage) {
+ outlen = 0;
+ for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
+ charlen = utf8_to_utf32(from, len, &u_to);
+ /* Failed conversion defaults to a question mark */
+ if (charlen < 1) {
+ charlen = 1;
+ outlen += 2;
+ } else if (u_to <= 0xFFFF)
+ outlen += 2;
+ else
+ outlen += 4;
+ }
+ return outlen;
+ }
+
for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
charlen = codepage->char2uni(from, len, &wchar_to);
/* Failed conversion defaults to a question mark */
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 80b3d845419f..b9a3290faaf7 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -106,7 +106,7 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
int remap);
#endif

-wchar_t cifs_toupper(wchar_t in);
+unicode_t cifs_toupper(unicode_t in);

/*
* UniStrcat: Concatenate the second string to the first
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 64b71c4e2a9d..9941bb6f2aad 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -569,6 +569,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
cifs_sb->ctx->dir_mode);
if (cifs_sb->ctx->iocharset)
seq_printf(s, ",iocharset=%s", cifs_sb->ctx->iocharset);
+ else
+ seq_puts(s, ",iocharset=utf8");
if (tcon->seal)
seq_puts(s, ",seal");
else if (tcon->ses->server->ignore_signature)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3781eee9360a..d560fb7a9aed 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2338,7 +2338,11 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
old->ctx->dir_mode != new->ctx->dir_mode)
return 0;

- if (strcmp(old->local_nls->charset, new->local_nls->charset))
+ if (old->local_nls && !new->local_nls)
+ return 0;
+ if (!old->local_nls && new->local_nls)
+ return 0;
+ if (old->local_nls && new->local_nls && strcmp(old->local_nls->charset, new->local_nls->charset))
return 0;

if (old->ctx->acregmax != new->ctx->acregmax)
@@ -2800,7 +2804,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
if (ctx->iocharset == NULL) {
/* load_nls_default cannot return null */
cifs_sb->local_nls = load_nls_default();
- } else {
+ } else if (strcmp(ctx->iocharset, "utf8") != 0) {
cifs_sb->local_nls = load_nls(ctx->iocharset);
if (cifs_sb->local_nls == NULL) {
cifs_dbg(VFS, "CIFS mount error: iocharset %s not found\n",
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 79402ca0ddfa..fa09fb5d3641 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -789,16 +789,22 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
{
struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
unsigned long hash;
+ unicode_t u;
wchar_t c;
int i, charlen;

hash = init_name_hash(dentry);
for (i = 0; i < q->len; i += charlen) {
- charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+ if (codepage) {
+ charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+ if (likely(charlen > 0))
+ u = c;
+ } else
+ charlen = utf8_to_utf32(&q->name[i], q->len - i, &u);
/* error out if we can't convert the character */
if (unlikely(charlen < 0))
return charlen;
- hash = partial_name_hash(cifs_toupper(c), hash);
+ hash = partial_name_hash(cifs_toupper(u), hash);
}
q->hash = end_name_hash(hash);

@@ -809,6 +815,7 @@ static int cifs_ci_compare(const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
+ unicode_t u1, u2;
wchar_t c1, c2;
int i, l1, l2;

@@ -822,9 +829,18 @@ static int cifs_ci_compare(const struct dentry *dentry,
return 1;

for (i = 0; i < len; i += l1) {
- /* Convert characters in both strings to UTF-16. */
- l1 = codepage->char2uni(&str[i], len - i, &c1);
- l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+ /* Convert characters in both strings to UTF-32. */
+ if (codepage) {
+ l1 = codepage->char2uni(&str[i], len - i, &c1);
+ l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+ if (likely(l1 > 0))
+ u1 = c1;
+ if (likely(l2 > 0))
+ u2 = c2;
+ } else {
+ l1 = utf8_to_utf32(&str[i], len - i, &u1);
+ l2 = utf8_to_utf32(&name->name[i], name->len - i, &u2);
+ }

/*
* If we can't convert either character, just declare it to
@@ -845,7 +861,7 @@ static int cifs_ci_compare(const struct dentry *dentry,
return 1;

/* Now compare uppercase versions of these characters */
- if (cifs_toupper(c1) != cifs_toupper(c2))
+ if (cifs_toupper(u1) != cifs_toupper(u2))
return 1;
}

diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c
index 59b6c577aa0a..fce38de59e13 100644
--- a/fs/cifs/winucase.c
+++ b/fs/cifs/winucase.c
@@ -18,7 +18,7 @@

#include <linux/nls.h>

-wchar_t cifs_toupper(wchar_t in); /* quiet sparse */
+unicode_t cifs_toupper(unicode_t in); /* quiet sparse */

static const wchar_t t2_00[256] = {
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
@@ -616,20 +616,24 @@ static const wchar_t *const toplevel[256] = {
};

/**
- * cifs_toupper - convert a wchar_t from lower to uppercase
+ * cifs_toupper - convert a unicode_t from lower to uppercase
* @in: character to convert from lower to uppercase
*
- * This function consults the static tables above to convert a wchar_t from
+ * This function consults the static tables above to convert a unicode_t from
* lower to uppercase. In the event that there is no mapping, the original
* "in" character is returned.
*/
-wchar_t
-cifs_toupper(wchar_t in)
+unicode_t
+cifs_toupper(unicode_t in)
{
unsigned char idx;
const wchar_t *tbl;
wchar_t out;

+ /* cifs_toupper table has only defines for plane-0 */
+ if (in > 0xffff)
+ return in;
+
/* grab upper byte */
idx = (in & 0xff00) >> 8;

--
2.20.1