Crude support for UTF8/UTF16 ID3v2 tags

From: Petr Baudis <pasky@ucw.cz>

This patch implements the support in a minimalistic way: ISO-8859-1 characters
(which coincide with the first 256 Unicode codepoints) will be shown
on the terminal (except in the case of UTF8, which is ASCII-only) and all other
characters will be transcribed as '*'. In particular, this fixes the horrible
output for UTF16 tags.

In the future, the locale should be taken into account when choosing the output
charset, but this still makes things better than before, I think. Also, iconv()
should be used, but apparently not all systems mpg123 wants to run on have a
usable iconv(), so this code will still be useful for those.
---

 src/id3.c |  127 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 113 insertions(+), 14 deletions(-)

diff --git a/src/id3.c b/src/id3.c
index 13cbc22..571cfea 100644
--- a/src/id3.c
+++ b/src/id3.c
@@ -52,13 +52,120 @@ void reset_id3()
 	id3.genre.fill = 0;
 }
 
+/* A text decoder converts @len bytes of ID3 text at @source from its
+ * encoding to ISO-8859-1 or ASCII in @dest, substituting unconvertible
+ * characters with '*', and returns the final length of the decoded string. */
+/* TODO: iconv() to whatever locale. But we will want to keep this code anyway
+ * for systems w/o iconv(). But we currently assume that it is enough to
+ * allocate @len bytes in dest. That might not be true when converting to
+ * Unicode encodings. */
+typedef int (*text_decoder)(char* dest, unsigned char* source, int len);
+
+static int decode_il1(char* dest, unsigned char* source, int len)
+{	/* No conversion: copies @len bytes from @source to @dest, returns @len. */
+	memcpy(dest, source, len);
+	return len;
+}
+
+/* Decode @len bytes of UTF-16 (big-endian if @str_be, else little-endian)
+ * from @source into ISO-8859-1 in @dest; returns the bytes written. */
+static int decode_utf16(char* dest, unsigned char* source, int len, int str_be)
+{
+	int spos, dlen = 0;
+
+	len -= len % 2; /* ignore a trailing odd byte */
+	for (spos = 0; spos < len; spos += 2) {
+		int word;
+		if (str_be)
+			word = source[spos] << 8 | source[spos+1];
+		else
+			word = source[spos] | source[spos+1] << 8;
+		if ((word & 0xfc00) == 0xdc00) /* low surrogate, '*' came from its lead */
+			continue;
+		else if (word > 255) /* codepoint outside ISO-8859-1 */
+			dest[dlen++] = '*';
+		else
+			dest[dlen++] = word;
+	}
+	return dlen;
+}
+
+/* UTF-16 with BOM; a missing BOM means big-endian with nothing skipped. */
+static int decode_utf16bom(char* dest, unsigned char* source, int len)
+{
+	if (len < 2)
+		return 0;
+	if (source[0] == 0xFF && source[1] == 0xFE) /* little-endian BOM */
+		return decode_utf16(dest, source + 2, len - 2, 0);
+	if (source[0] == 0xFE && source[1] == 0xFF) /* big-endian BOM */
+		return decode_utf16(dest, source + 2, len - 2, 1);
+	return decode_utf16(dest, source, len, 1);
+}
+
+static int decode_utf16be(char* dest, unsigned char* source, int len)
+{	/* text_decoder-shaped adapter: decode_utf16() with str_be set. */
+	return decode_utf16(dest, source, len, 1);
+}
+
+/* Decode @len bytes of UTF-8 from @source into ASCII in @dest: each
+ * multi-byte sequence becomes one '*'; returns the bytes written. */
+static int decode_utf8(char* dest, unsigned char* source, int len)
+{
+	int spos, dlen = 0;
+	for (spos = 0; spos < len; spos++) {
+		if ((source[spos] & 0xc0) == 0x80) /* utf8 continuation byte */
+			continue;
+		else if (source[spos] & 0x80) /* utf8 lead byte */
+			dest[dlen++] = '*';
+		else
+			dest[dlen++] = source[spos];
+	}
+	return dlen;
+}
+
+static text_decoder text_decoders[4] = { /* indexed by the text encoding byte */
+	decode_il1, decode_utf16bom, decode_utf16be, decode_utf8
+};
+static int encoding_widths[4] = { /* bytes per character/terminator, same index */
+	1, 2, 2, 1
+};
+
+/* determine byte length of string with characters wide @width, reading at most
+ * @string_size bytes; terminating 0 will be included, too, if there is any */
+int wide_bytelen(int width, char* string, size_t string_size)
+{
+	size_t l = 0;
+	/* stop before a trailing partial character so we never read past the end */
+	while (l + width <= string_size) {
+		int b;
+		for (b = 0; b < width; b++)
+			if (string[l + b])
+				break;
+		l += width;
+		if (b == width) /* terminating zero */
+			return l;
+	}
+	return string_size; /* unterminated: the whole buffer is the string */
+}
+
 void store_id3_text(struct stringbuf* sb, char* source, size_t source_size)
 {
 	size_t pos = 1; /* skipping the encoding */
+	int encoding;
+	int bwidth;
 	if(! source_size) return;
-	if(!(source[0] == 0 || source[0] == 3))
+	encoding = source[0];
+	if(encoding > 3)
 	{
-		warning("Not ISO8859-1 or UTF8 encoding of text - I will probably screw a bit up!");
+		warning1("Unknown text encoding %d, assuming ISO8859-1 - I will probably screw a bit up!", encoding);
+		encoding = 0;
+	}
+	bwidth = encoding_widths[encoding];
+	if((source_size-1) % bwidth)
+	{
+		/* Uh. (BTW, the -1 is for the encoding byte.) */
+		warning2("Weird tag size %d for encoding %d - I will probably trim too early or something but I think the MP3 is broken.", source_size, encoding);
+		source_size -= (source_size-1) % bwidth;
 	}
 	/*
 		first byte: Text encoding          $xx
@@ -68,24 +175,16 @@ void store_id3_text(struct stringbuf* sb
 	*/
 	while(pos < source_size)
 	{
-		/* determine length of string, 0 will be stored, too */
-		size_t l = strlen(source+pos)+1;
-		if(pos+l > source_size) l = source_size - pos + 1; /* not null-terminated... */
+		size_t l = wide_bytelen(bwidth, source+pos, source_size-pos);
 		if((sb->size >= sb->fill+l) || resize_stringbuf(sb, sb->fill+l))
 		{
 			/* append with line break */
 			if(sb->fill) sb->p[sb->fill-1] = '\n';
-			/* do not copy the ending 0 since it may not be there */
-			memcpy(sb->p+sb->fill, source+pos, l-1);
-			sb->fill += l;
-			sb->p[sb->fill-1] = 0;
+			/* do not include the ending 0 in the conversion */
+			sb->fill += text_decoders[encoding](sb->p+sb->fill, (unsigned char *) source+pos, l-(source_size==pos+l ? 0 : bwidth));
+			sb->p[sb->fill++] = 0;
 			/* advance to beginning of next string */
 			pos += l;
-			while(pos < source_size && source[pos] == 0)
-			{
-				/* an additonal null could mean that we are dealing with unicode... */
-				++pos;
-			}
 		}
 		else break;
 	}