Crude support for UTF8/UTF16 ID3v2 tags

From: Petr Baudis

This patch implements this in a minimalistic way - ISO-8859-1 characters
(which coincide with the first 256 Unicode codepoints) will be shown on
the terminal (except in case of UTF8 which is ASCII-only) and other
characters will get transcribed as '*'. This fixes especially horrible
output for UTF16 tags.

In the future, the locale should be taken into account when choosing the
output charset, but this still makes it better than before, I think. Also,
iconv() should be used, but not all systems mpg123 wants to run on
apparently have usable iconv() so this code will still have use for those.

[review: test UTF-8 continuation bytes and UTF-16 low surrogates with the
 proper masks instead of dropping most characters, read the encoding byte
 as unsigned, reserve room for the terminating 0 of the last string, and
 print the size_t tag size with a matching format]
---

 src/id3.c |  130 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/src/id3.c b/src/id3.c
index 13cbc22..571cfea 100644
--- a/src/id3.c
+++ b/src/id3.c
@@ -52,13 +52,120 @@ void reset_id3()
 	id3.genre.fill = 0;
 }
 
+/* Text decoder decodes the ID3 text content from whatever encoding to
+ * ISO-8859-1 or ASCII, substituting unconvertible characters with '*'
+ * and returning the final length of decoded string. */
+/* TODO: iconv() to whatever locale. But we will want to keep this code anyway
+ * for systems w/o iconv(). But we currently assume that it is enough to
+ * allocate @len bytes in dest. That might not be true when converting to
+ * Unicode encodings. */
+typedef int (*text_decoder)(char* dest, unsigned char* source, int len);
+
+static int decode_il1(char* dest, unsigned char* source, int len)
+{
+	memcpy(dest, source, len);
+	return len;
+}
+
+static int decode_utf16(char* dest, unsigned char* source, int len, int str_be)
+{
+	int spos = 0;
+	int dlen = 0;
+
+	len -= len % 2;
+	/* Only ISO-8859-1 survives, everything else becomes '*'. */
+	for (; spos < len; spos += 2) {
+		int word;
+		if (str_be)
+			word = source[spos] << 8 | source[spos+1];
+		else
+			word = source[spos] | source[spos+1] << 8;
+		if ((word & 0xfc00) == 0xdc00) /* utf16 low surrogate, its pair already gave '*' */
+			continue;
+		else if (word > 255) /* outside ISO-8859-1 (high surrogates included) */
+			dest[dlen++] = '*';
+		else
+			dest[dlen++] = word;
+	}
+	return dlen;
+}
+
+static int decode_utf16bom(char* dest, unsigned char* source, int len)
+{
+	if (len < 2)
+		return 0;
+	if (source[0] == 0xFF && source[1] == 0xFE)
+		/* Little-endian */
+		return decode_utf16(dest, source + 2, len - 2, 0);
+	else
+		/* Big-endian */
+		return decode_utf16(dest, source + 2, len - 2, 1);
+}
+
+static int decode_utf16be(char* dest, unsigned char* source, int len)
+{
+	return decode_utf16(dest, source, len, 1);
+}
+
+static int decode_utf8(char* dest, unsigned char* source, int len)
+{
+	int spos = 0;
+	int dlen = 0;
+	/* Just ASCII, we take it easy. */
+	for (; spos < len; spos++) {
+		if ((source[spos] & 0xc0) == 0x80) /* utf8 continuation byte (10xxxxxx) */
+			continue;
+		else if (source[spos] & 0x80) /* utf8 lead byte */
+			dest[dlen++] = '*';
+		else
+			dest[dlen++] = source[spos];
+	}
+	return dlen;
+}
+
+static text_decoder text_decoders[4] = {
+	decode_il1, decode_utf16bom, decode_utf16be, decode_utf8
+};
+static int encoding_widths[4] = {
+	1, 2, 2, 1
+};
+
+/* determine byte length of string with characters wide @width;
+ * terminating 0 will be included, too, if there is any */
+int wide_bytelen(int width, char* string, size_t string_size)
+{
+	size_t l = 0;
+	while (l < string_size) {
+		int b;
+		for (b = 0; b < width; b++)
+			if (string[l + b])
+				break;
+		l += width;
+		if (b == width) /* terminating zero */
+			return l;
+	}
+	return l;
+
+}
+
 void store_id3_text(struct stringbuf* sb, char* source, size_t source_size)
 {
 	size_t pos = 1; /* skipping the encoding */
+	int encoding;
+	int bwidth;
 	if(! source_size) return;
-	if(!(source[0] == 0 || source[0] == 3))
+	encoding = (unsigned char) source[0]; /* plain char may be signed; never index with a negative */
+	if(encoding > 3)
 	{
-		warning("Not ISO8859-1 or UTF8 encoding of text - I will probably screw a bit up!");
+		warning1("Unknown text encoding %d, assuming ISO8859-1 - I will probably screw a bit up!", encoding);
+		encoding = 0;
+	}
+	bwidth = encoding_widths[encoding];
+	if((source_size-1) % bwidth)
+	{
+		/* Uh. (BTW, the -1 is for the encoding byte.) */
+		warning2("Weird tag size %lu for encoding %d - I will probably trim too early or something but I think the MP3 is broken.", (unsigned long) source_size, encoding);
+		source_size -= (source_size-1) % bwidth;
 	}
 	/*
 		first byte: Text encoding $xx
@@ -68,24 +175,17 @@ void store_id3_text(struct stringbuf* sb
 	*/
 	while(pos < source_size)
 	{
-		/* determine length of string, 0 will be stored, too */
-		size_t l = strlen(source+pos)+1;
-		if(pos+l > source_size) l = source_size - pos + 1; /* not null-terminated... */
-		if((sb->size >= sb->fill+l) || resize_stringbuf(sb, sb->fill+l))
+		size_t l = wide_bytelen(bwidth, source+pos, source_size-pos);
+		/* +1: the last string is decoded whole (up to l bytes) and then gets our own 0 */
+		if((sb->size >= sb->fill+l+1) || resize_stringbuf(sb, sb->fill+l+1))
 		{
 			/* append with line break */
 			if(sb->fill) sb->p[sb->fill-1] = '\n';
-			/* do not copy the ending 0 since it may not be there */
-			memcpy(sb->p+sb->fill, source+pos, l-1);
-			sb->fill += l;
-			sb->p[sb->fill-1] = 0;
+			/* do not include the ending 0 in the conversion */
+			sb->fill += text_decoders[encoding](sb->p+sb->fill, (unsigned char *) source+pos, l-(source_size==pos+l ? 0 : bwidth));
+			sb->p[sb->fill++] = 0;
 			/* advance to beginning of next string */
 			pos += l;
-			while(pos < source_size && source[pos] == 0)
-			{
-				/* an additonal null could mean that we are dealing with unicode... */
-				++pos;
-			}
 		}
 		else break;
 	}