[PATCH v2 7/7] udf: Merge linux specific translation into CS0 conversion function

From: Andrew Gabbasov
Date: Thu Dec 24 2015 - 11:27:24 EST


The current implementation of the udf_translate_to_linux function does not
support multi-byte characters at all: it counts bytes when calculating
the extension length, and when inserting the CRC into the name it does not
take character boundaries into account, so it can split the name in
the middle of a character.

The most efficient way to properly support multi-byte characters is
to merge the translation operations directly into the conversion function.
This avoids extra passes over the string and avoids parsing
a multi-byte character back into Unicode just to find out its length.

Signed-off-by: Andrew Gabbasov <andrew_gabbasov@xxxxxxxxxx>
---
fs/udf/unicode.c | 260 ++++++++++++++++++++++++++++++-------------------------
1 file changed, 141 insertions(+), 119 deletions(-)

diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index f1cdeac..1dc967d 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,9 +28,6 @@

#include "udf_sb.h"

-static int udf_translate_to_linux(uint8_t *, int, const uint8_t *, int,
- const uint8_t *, int);
-
static int udf_uni2char_utf8(wchar_t uni,
unsigned char *out,
int boundlen)
@@ -114,13 +111,32 @@ static int udf_char2uni_utf8(const unsigned char *in,
return u_len;
}

+#define ILLEGAL_CHAR_MARK '_'
+#define EXT_MARK '.'
+#define CRC_MARK '#'
+#define EXT_SIZE 5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN 5
+
static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
const uint8_t *ocu, int ocu_len,
- int (*conv_f)(wchar_t, unsigned char *, int))
+ int (*conv_f)(wchar_t, unsigned char *, int),
+ int translate)
{
+ uint32_t c;
uint8_t cmp_id;
int i, len;
- int str_o_len = 0;
+ int u_ch;
+ int firstDots = 0, needsCRC = 0, illChar;
+ int ext_i_len, ext_max_len;
+ int str_o_len = 0; /* Length of resulting output */
+ int ext_o_len = 0; /* Extension output length */
+ int ext_crc_len = 0; /* Extension output length if used with CRC */
+ int i_ext = -1; /* Extension position in input buffer */
+ int o_crc = 0; /* Rightmost possible output pos for CRC+ext */
+ unsigned short valueCRC;
+ uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
+ uint8_t crc[CRC_LEN];

if (str_max_len <= 0)
return 0;
@@ -133,22 +149,134 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
cmp_id = ocu[0];
if (cmp_id != 8 && cmp_id != 16) {
memset(str_o, 0, str_max_len);
- pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu);
+ pr_err("unknown compression code (%d)\n", cmp_id);
return -EINVAL;
}
+ u_ch = cmp_id >> 3;
+
+ ocu++;
+ ocu_len--;
+
+ if (translate) {
+ /* Look for extension */
+ for (i = (ocu_len & ~(u_ch - 1)) - u_ch, ext_i_len = 0;
+ (i >= 0) && (ext_i_len < EXT_SIZE);
+ i -= u_ch, ext_i_len++) {
+
+ c = ocu[i];
+ if (u_ch > 1)
+ c = (c << 8) | ocu[i + 1];
+
+ if (c == EXT_MARK) {
+ if (ext_i_len)
+ i_ext = i;
+ break;
+ }
+ }
+ if (i_ext >= 0) {
+ /* Convert extension */
+ ext_max_len = min_t(int, sizeof(ext), str_max_len);
+ ext[ext_o_len++] = EXT_MARK;
+ illChar = 0;
+ for (i = i_ext + u_ch; i < ocu_len;) {
+
+ c = ocu[i++];
+ if (u_ch > 1)
+ c = (c << 8) | ocu[i++];
+
+ if (c == '/' || c == 0) {
+ if (illChar)
+ continue;
+ illChar = 1;
+ needsCRC = 1;
+ c = ILLEGAL_CHAR_MARK;
+ } else {
+ illChar = 0;
+ }
+
+ len = conv_f(c, &ext[ext_o_len],
+ ext_max_len - ext_o_len);
+ /* Valid character? */
+ if (len >= 0) {
+ ext_o_len += len;
+ } else {
+ ext[ext_o_len++] = '?';
+ needsCRC = 1;
+ }
+ if ((ext_o_len + CRC_LEN) < str_max_len)
+ ext_crc_len = ext_o_len;
+ }
+ }
+ }
+
+ illChar = 0;
+ for (i = 0; i < ocu_len;) {
+
+ if (str_o_len >= str_max_len) {
+ needsCRC = 1;
+ break;
+ }
+
+ if (translate && (i == i_ext)) {
+ if (str_o_len > (str_max_len - ext_o_len))
+ needsCRC = 1;
+ break;
+ }

- for (i = 1; (i < ocu_len) && (str_o_len < str_max_len);) {
/* Expand OSTA compressed Unicode to Unicode */
- uint32_t c = ocu[i++];
- if (cmp_id == 16)
+ c = ocu[i++];
+ if (u_ch > 1)
c = (c << 8) | ocu[i++];

+ if (translate) {
+ if ((c == '.') && (firstDots >= 0))
+ firstDots++;
+ else
+ firstDots = -1;
+
+ if (c == '/' || c == 0) {
+ if (illChar)
+ continue;
+ illChar = 1;
+ needsCRC = 1;
+ c = ILLEGAL_CHAR_MARK;
+ } else {
+ illChar = 0;
+ }
+ }
+
len = conv_f(c, &str_o[str_o_len], str_max_len - str_o_len);
/* Valid character? */
- if (len >= 0)
+ if (len >= 0) {
str_o_len += len;
- else
+ } else {
str_o[str_o_len++] = '?';
+ needsCRC = 1;
+ }
+ if (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))
+ o_crc = str_o_len;
+ }
+
+ if (translate) {
+ if ((firstDots == 1) || (firstDots == 2))
+ needsCRC = 1;
+ if (needsCRC) {
+ str_o_len = o_crc;
+ valueCRC = crc_itu_t(0, ocu, ocu_len);
+ crc[0] = CRC_MARK;
+ crc[1] = hex_asc_upper_hi(valueCRC >> 8);
+ crc[2] = hex_asc_upper_lo(valueCRC >> 8);
+ crc[3] = hex_asc_upper_hi(valueCRC);
+ crc[4] = hex_asc_upper_lo(valueCRC);
+ len = min_t(int, CRC_LEN, str_max_len - str_o_len);
+ memcpy(&str_o[str_o_len], crc, len);
+ str_o_len += len;
+ ext_o_len = ext_crc_len;
+ }
+ if (ext_o_len > 0) {
+ memcpy(&str_o[str_o_len], ext, ext_o_len);
+ str_o_len += ext_o_len;
+ }
}

return str_o_len;
@@ -202,13 +330,12 @@ try_again:
int udf_CS0toUTF8(uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len)
{
return udf_name_from_CS0(utf_o, o_len, ocu_i, i_len,
- udf_uni2char_utf8);
+ udf_uni2char_utf8, 0);
}

int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
uint8_t *dname, int dlen)
{
- uint8_t *filename;
int (*conv_f)(wchar_t, unsigned char *, int);
int ret;

@@ -218,10 +345,6 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
if (dlen <= 0)
return 0;

- filename = kmalloc(dlen, GFP_NOFS);
- if (!filename)
- return -ENOMEM;
-
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
conv_f = udf_uni2char_utf8;
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
@@ -229,18 +352,10 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
} else
BUG();

- ret = udf_name_from_CS0(filename, dlen, sname, slen, conv_f);
- if (ret < 0) {
- udf_debug("Failed in udf_get_filename: sname = %s\n", sname);
- goto out2;
- }
-
- ret = udf_translate_to_linux(dname, dlen, filename, dlen, sname, slen);
+ ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
/* Zero length filename isn't valid... */
if (ret == 0)
ret = -EINVAL;
-out2:
- kfree(filename);
return ret;
}

@@ -259,96 +374,3 @@ int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
}

-#define ILLEGAL_CHAR_MARK '_'
-#define EXT_MARK '.'
-#define CRC_MARK '#'
-#define EXT_SIZE 5
-/* Number of chars we need to store generated CRC to make filename unique */
-#define CRC_LEN 5
-
-static int udf_translate_to_linux(uint8_t *newName, int newLen,
- const uint8_t *udfName, int udfLen,
- const uint8_t *fidName, int fidNameLen)
-{
- int index, newIndex = 0, needsCRC = 0;
- int extIndex = 0, newExtIndex = 0, hasExt = 0;
- unsigned short valueCRC;
- uint8_t curr;
-
- if (udfName[0] == '.' &&
- (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
- needsCRC = 1;
- newIndex = udfLen;
- memcpy(newName, udfName, udfLen);
- } else {
- for (index = 0; index < udfLen; index++) {
- curr = udfName[index];
- if (curr == '/' || curr == 0) {
- needsCRC = 1;
- curr = ILLEGAL_CHAR_MARK;
- while (index + 1 < udfLen &&
- (udfName[index + 1] == '/' ||
- udfName[index + 1] == 0))
- index++;
- }
- if (curr == EXT_MARK &&
- (udfLen - index - 1) <= EXT_SIZE) {
- if (udfLen == index + 1)
- hasExt = 0;
- else {
- hasExt = 1;
- extIndex = index;
- newExtIndex = newIndex;
- }
- }
- if (newIndex < newLen)
- newName[newIndex++] = curr;
- else
- needsCRC = 1;
- }
- }
- if (needsCRC) {
- uint8_t ext[EXT_SIZE];
- int localExtIndex = 0;
-
- if (hasExt) {
- int maxFilenameLen;
- for (index = 0;
- index < EXT_SIZE && extIndex + index + 1 < udfLen;
- index++) {
- curr = udfName[extIndex + index + 1];
-
- if (curr == '/' || curr == 0) {
- needsCRC = 1;
- curr = ILLEGAL_CHAR_MARK;
- while (extIndex + index + 2 < udfLen &&
- (index + 1 < EXT_SIZE &&
- (udfName[extIndex + index + 2] == '/' ||
- udfName[extIndex + index + 2] == 0)))
- index++;
- }
- ext[localExtIndex++] = curr;
- }
- maxFilenameLen = newLen - CRC_LEN - localExtIndex;
- if (newIndex > maxFilenameLen)
- newIndex = maxFilenameLen;
- else
- newIndex = newExtIndex;
- } else if (newIndex > newLen - CRC_LEN)
- newIndex = newLen - CRC_LEN;
- newName[newIndex++] = CRC_MARK;
- valueCRC = crc_itu_t(0, fidName, fidNameLen);
- newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
- newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
- newName[newIndex++] = hex_asc_upper_hi(valueCRC);
- newName[newIndex++] = hex_asc_upper_lo(valueCRC);
-
- if (hasExt) {
- newName[newIndex++] = EXT_MARK;
- for (index = 0; index < localExtIndex; index++)
- newName[newIndex++] = ext[index];
- }
- }
-
- return newIndex;
-}
--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/