From cee834154508f1e6ff8f7f2d2f79ec840545ac0f Mon Sep 17 00:00:00 2001
From: Klaus Schmidinger
Date: Tue, 21 Aug 2012 08:23:13 +0200
Subject: Fixed handling control characters in SI data in case of UTF-8 encoded strings

---
 Review amendment: String::decodeText() now decrements 'len' by the number of
 bytes removed whenever memmove() closes the gap left by a control code, so
 both loops stop exactly at the terminating 0 (previously they ran on for as
 many bytes as had been removed). Utf8CharLen() and the four-argument
 decodeText() got header comments. The layout of this patch was restored from
 a whitespace-collapsed copy, so indentation is conventional, and the blob id
 after ".." on the libsi/si.c index line predates the amendment.

 CONTRIBUTORS |   3 ++
 HISTORY      |   5 ++-
 libsi/si.c   | 141 +++++++++++++++++++++++++++++++++----------------------------
 3 files changed, 83 insertions(+), 66 deletions(-)

diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 6512bfcb..608251b2 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -2917,3 +2917,6 @@ Dirk Heiser
 Ludi Kaleni
  for suggesting to add the source character to channel names whenever they are
  displayed
+
+Mehdi Karamnejad
+ for reporting a problem with garbled UTF-8 EPG data and helping to debug it
diff --git a/HISTORY b/HISTORY
index 3b272931..70463b7d 100644
--- a/HISTORY
+++ b/HISTORY
@@ -7191,6 +7191,9 @@ Video Disk Recorder Revision History
   turn on adding the source character to channel names whenever they are displayed
   (suggested by Ludi Kaleni).
 
-2012-07-15: Version 1.7.30
+2012-08-21: Version 1.7.30
 
 - Fixed sorting recordings in the top level video directory.
+- Fixed handling control characters in SI data in case of UTF-8 encoded strings
+  (thanks to Mehdi Karamnejad for reporting a problem with garbled UTF-8 EPG data
+  and helping to debug it).
diff --git a/libsi/si.c b/libsi/si.c
index 4db917be..cd889bcd 100644
--- a/libsi/si.c
+++ b/libsi/si.c
@@ -6,7 +6,7 @@
  *   the Free Software Foundation; either version 2 of the License, or     *
  *   (at your option) any later version.                                   *
  *                                                                         *
- *   $Id: si.c 2.6 2011/12/10 15:47:15 kls Exp $
+ *   $Id: si.c 2.7 2012/08/21 08:10:00 kls Exp $
  *                                                                         *
  ***************************************************************************/
 
@@ -405,6 +405,24 @@ bool convertCharacterTable(const char *from, size_t fromLength, char *to, size_t
   return false;
 }
 
+// Returns the number of bytes of the character at the beginning of s:
+// always 1 if the system character table is single byte, otherwise 2, 3 or 4
+// for a lead byte followed by that many minus one 10xxxxxx bytes, else 1.
+// A similar version is used in VDR/tools.c:
+static int Utf8CharLen(const char *s)
+{
+  if (SystemCharacterTableIsSingleByte)
+     return 1;
+#define MT(s, m, v) ((*(s) & (m)) == (v)) // Mask Test
+  if (MT(s, 0xE0, 0xC0) && MT(s + 1, 0xC0, 0x80))
+     return 2;
+  if (MT(s, 0xF0, 0xE0) && MT(s + 1, 0xC0, 0x80) && MT(s + 2, 0xC0, 0x80))
+     return 3;
+  if (MT(s, 0xF8, 0xF0) && MT(s + 1, 0xC0, 0x80) && MT(s + 2, 0xC0, 0x80) && MT(s + 3, 0xC0, 0x80))
+     return 4;
+  return 1;
+}
+
 // originally from libdtv, Copyright Rolf Hakenes
 void String::decodeText(char *buffer, int size) {
    const unsigned char *from=data.getData(0);
@@ -413,82 +431,75 @@ void String::decodeText(char *buffer, int size) {
    if (len <= 0) {
       *to = '\0';
       return;
-   }
+      }
    bool singleByte;
    const char *cs = getCharacterTable(from, len, &singleByte);
-   // FIXME Need to make this UTF-8 aware (different control codes).
-   // However, there's yet to be found a broadcaster that actually
-   // uses UTF-8 for the SI data... (kls 2007-06-10)
-   for (int i = 0; i < len; i++) {
-      if (*from == 0)
-         break;
-      if ( ((' ' <= *from) && (*from <= '~'))
-         || (*from == '\n')
-         || (0xA0 <= *from)
-         )
-         *to++ = *from;
-      else if (*from == 0x8A)
-         *to++ = '\n';
-      from++;
-      if (to - buffer >= size - 1)
-         break;
+   if (singleByte && SystemCharacterTableIsSingleByte || !convertCharacterTable((const char *)from, len, to, size, cs)) {
+      if (len >= size)
+         len = size - 1;
+      strncpy(to, (const char *)from, len);
+      to[len] = 0;
       }
-   *to = '\0';
-   if (!singleByte || !SystemCharacterTableIsSingleByte) {
-      char convBuffer[size];
-      if (convertCharacterTable(buffer, strlen(buffer), convBuffer, sizeof(convBuffer), cs))
-         strncpy(buffer, convBuffer, strlen(convBuffer) + 1);
+   else
+      len = strlen(to); // might have changed
+   // Handle control codes:
+   while (len > 0) {
+      int l = Utf8CharLen(to);
+      if (l <= 2) {
+         unsigned char *p = (unsigned char *)to;
+         if (l == 2 && *p == 0xC2) // UTF-8 sequence
+            p++;
+         bool Move = true;
+         switch (*p) {
+           case 0x8A: *to = '\n'; break;
+           case 0xA0: *to = ' '; break;
+           default: Move = false;
+           }
+         if (l == 2 && Move) {
+            memmove(p, p + 1, len - 1); // we also copy the terminating 0!
+            len--; // the string is now one byte shorter
+            l = 1;
+            }
+         }
+      to += l;
+      len -= l;
       }
 }
 
+// Decodes the text into buffer like the version above, then removes the control
+// codes 0x86/0x87 from buffer and copies the text they enclose to shortVersion.
 void String::decodeText(char *buffer, char *shortVersion, int sizeBuffer, int sizeShortVersion) {
-   const unsigned char *from=data.getData(0);
+   decodeText(buffer, sizeBuffer);
+   if (!*buffer) {
+      *shortVersion = '\0';
+      return;
+      }
+   // Handle control codes:
    char *to=buffer;
-   char *toShort=shortVersion;
+   int len=strlen(to);
    int IsShortName=0;
-   int len=getLength();
-   if (len <= 0) {
-      *to = '\0';
-      *toShort = '\0';
-      return;
+   while (len > 0) {
+      int l = Utf8CharLen(to);
+      unsigned char *p = (unsigned char *)to;
+      if (l == 2 && *p == 0xC2) // UTF-8 sequence
+         p++;
+      if (*p == 0x86 || *p == 0x87) {
+         IsShortName += (*p == 0x86) ? 1 : -1;
+         memmove(to, to + l, len - l + 1); // we also copy the terminating 0!
+         len -= l; // the string is now l bytes shorter
+         l = 0;
          }
-   bool singleByte;
-   const char *cs = getCharacterTable(from, len, &singleByte);
-   // FIXME Need to make this UTF-8 aware (different control codes).
-   // However, there's yet to be found a broadcaster that actually
-   // uses UTF-8 for the SI data... (kls 2007-06-10)
-   for (int i = 0; i < len; i++) {
-      if ( ((' ' <= *from) && (*from <= '~'))
-         || (*from == '\n')
-         || (0xA0 <= *from)
-         )
-      {
-         *to++ = *from;
-         if (IsShortName)
-            *toShort++ = *from;
+      if (l && IsShortName) {
+         if (l < sizeShortVersion) {
+            for (int i = 0; i < l; i++)
+                *shortVersion++ = to[i];
+            sizeShortVersion -= l;
+            }
          }
-      else if (*from == 0x8A)
-         *to++ = '\n';
-      else if (*from == 0x86)
-         IsShortName++;
-      else if (*from == 0x87)
-         IsShortName--;
-      else if (*from == 0)
-         break;
-      from++;
-      if (to - buffer >= sizeBuffer - 1 || toShort - shortVersion >= sizeShortVersion - 1)
-         break;
-   }
-   *to = '\0';
-   *toShort = '\0';
-   if (!singleByte || !SystemCharacterTableIsSingleByte) {
-      char convBuffer[sizeBuffer];
-      if (convertCharacterTable(buffer, strlen(buffer), convBuffer, sizeof(convBuffer), cs))
-         strncpy(buffer, convBuffer, strlen(convBuffer) + 1);
-      char convShortVersion[sizeShortVersion];
-      if (convertCharacterTable(shortVersion, strlen(shortVersion), convShortVersion, sizeof(convShortVersion), cs))
-         strncpy(shortVersion, convShortVersion, strlen(convShortVersion) + 1);
+      to += l;
+      len -= l;
       }
+   *shortVersion = '\0';
 }
 
 Descriptor *Descriptor::getDescriptor(CharArray da, DescriptorTagDomain domain, bool returnUnimplemetedDescriptor) {
-- 
cgit v1.2.3