From cee834154508f1e6ff8f7f2d2f79ec840545ac0f Mon Sep 17 00:00:00 2001
From: Klaus Schmidinger <vdr@tvdr.de>
Date: Tue, 21 Aug 2012 08:23:13 +0200
Subject: Fixed handling control characters in SI data in case of UTF-8 encoded
 strings

---
 CONTRIBUTORS |   3 ++
 HISTORY      |   5 ++-
 libsi/si.c   | 134 ++++++++++++++++++++++++++++++-----------------------------
 3 files changed, 76 insertions(+), 66 deletions(-)

diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 6512bfcb..608251b2 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -2917,3 +2917,6 @@ Dirk Heiser <dirk-vdr@gmx.de>
 
 Ludi Kaleni <ludi113@hotmail.com>
  for suggesting to add the source character to channel names whenever they are displayed
+
+Mehdi Karamnejad <mehdi_karamnejad@sfu.ca>
+ for reporting a problem with garbled UTF-8 EPG data and helping to debug it
diff --git a/HISTORY b/HISTORY
index 3b272931..70463b7d 100644
--- a/HISTORY
+++ b/HISTORY
@@ -7191,6 +7191,9 @@ Video Disk Recorder Revision History
   turn on adding the source character to channel names whenever they are displayed
   (suggested by Ludi Kaleni).
 
-2012-07-15: Version 1.7.30
+2012-08-21: Version 1.7.30
 
 - Fixed sorting recordings in the top level video directory.
+- Fixed handling control characters in SI data in case of UTF-8 encoded strings
+  (thanks to Mehdi Karamnejad for reporting a problem with garbled UTF-8 EPG data
+  and helping to debug it).
diff --git a/libsi/si.c b/libsi/si.c
index 4db917be..cd889bcd 100644
--- a/libsi/si.c
+++ b/libsi/si.c
@@ -6,7 +6,7 @@
  *   the Free Software Foundation; either version 2 of the License, or     *
  *   (at your option) any later version.                                   *
  *                                                                         *
- *   $Id: si.c 2.6 2011/12/10 15:47:15 kls Exp $
+ *   $Id: si.c 2.7 2012/08/21 08:10:00 kls Exp $
  *                                                                         *
  ***************************************************************************/
 
@@ -405,6 +405,21 @@ bool convertCharacterTable(const char *from, size_t fromLength, char *to, size_t
   return false;
 }
 
+// Returns the number of bytes (1-4) used by the character at s. A similar version is used in VDR/tools.c:
+static int Utf8CharLen(const char *s)
+{
+  if (SystemCharacterTableIsSingleByte)
+     return 1; // no UTF-8 decoding in this case: every byte counts as one character
+#define MT(s, m, v) ((*(s) & (m)) == (v)) // Mask Test
+  if (MT(s, 0xE0, 0xC0) && MT(s + 1, 0xC0, 0x80)) // 110xxxxx 10xxxxxx
+     return 2;
+  if (MT(s, 0xF0, 0xE0) && MT(s + 1, 0xC0, 0x80) && MT(s + 2, 0xC0, 0x80)) // 1110xxxx plus two 10xxxxxx bytes
+     return 3;
+  if (MT(s, 0xF8, 0xF0) && MT(s + 1, 0xC0, 0x80) && MT(s + 2, 0xC0, 0x80) && MT(s + 3, 0xC0, 0x80)) // 11110xxx plus three 10xxxxxx bytes
+     return 4;
+  return 1; // anything else (including an incomplete sequence, e.g. one cut off by the terminating 0)
+}
+
 // originally from libdtv, Copyright Rolf Hakenes <hakenes@hippomi.de>
 void String::decodeText(char *buffer, int size) {
    const unsigned char *from=data.getData(0);
@@ -413,82 +428,71 @@ void String::decodeText(char *buffer, int size) {
    if (len <= 0) {
       *to = '\0';
       return;
-      }
+   }
    bool singleByte;
    const char *cs = getCharacterTable(from, len, &singleByte);
-   // FIXME Need to make this UTF-8 aware (different control codes).
-   // However, there's yet to be found a broadcaster that actually
-   // uses UTF-8 for the SI data... (kls 2007-06-10)
-   for (int i = 0; i < len; i++) {
-      if (*from == 0)
-         break;
-      if (    ((' ' <= *from) && (*from <= '~'))
-           || (*from == '\n')
-           || (0xA0 <= *from)
-         )
-         *to++ = *from;
-      else if (*from == 0x8A)
-         *to++ = '\n';
-      from++;
-      if (to - buffer >= size - 1)
-         break;
+   if (singleByte && SystemCharacterTableIsSingleByte || !convertCharacterTable((const char *)from, len, to, size, cs)) {
+      if (len >= size) // not converted: copy the raw bytes, truncated to fit the buffer
+         len = size - 1;
+      strncpy(to, (const char *)from, len);
+      to[len] = 0; // strncpy() doesn't terminate the string if it had to truncate
    }
-   *to = '\0';
-   if (!singleByte || !SystemCharacterTableIsSingleByte) {
-      char convBuffer[size];
-      if (convertCharacterTable(buffer, strlen(buffer), convBuffer, sizeof(convBuffer), cs))
-         strncpy(buffer, convBuffer, strlen(convBuffer) + 1);
+   else
+      len = strlen(to); // might have changed
+   // Handle control codes (0x8A -> newline, 0xA0 -> blank), also in their two byte form 0xC2 0x8A / 0xC2 0xA0:
+   while (len > 0) {
+      int l = Utf8CharLen(to);
+      len -= l; // from here on 'len' is the number of bytes that follow this character
+      if (l <= 2) {
+         unsigned char *p = (unsigned char *)to;
+         if (l == 2 && *p == 0xC2) // UTF-8 sequence
+            p++;
+         bool Move = true;
+         switch (*p) {
+           case 0x8A: *to = '\n'; break;
+           case 0xA0: *to = ' ';  break;
+           default:   Move = false;
+         }
+         if (l == 2 && Move) {
+            memmove(p, p + 1, len + 1); // drop the second byte; we also copy the terminating 0!
+            l = 1; // the replacement is one byte long, 'len' already accounts for both original bytes
+         }
+      }
+      to += l;
    }
 }
 
 void String::decodeText(char *buffer, char *shortVersion, int sizeBuffer, int sizeShortVersion) {
-   const unsigned char *from=data.getData(0);
+   decodeText(buffer, sizeBuffer);
+   if (!*buffer) {
+      *shortVersion = '\0';
+      return;
+   }
+   // Handle control codes (0x86 starts, 0x87 ends the part copied to shortVersion; both are removed from buffer):
    char *to=buffer;
-   char *toShort=shortVersion;
+   int len=strlen(to);
    int IsShortName=0;
-   int len=getLength();
-   if (len <= 0) {
-      *to = '\0';
-      *toShort = '\0';
-      return;
+   while (len > 0) {
+      int l = Utf8CharLen(to);
+      len -= l; // from here on 'len' is the number of bytes that follow this character
+      unsigned char *p = (unsigned char *)to;
+      if (l == 2 && *p == 0xC2) // UTF-8 sequence
+         p++;
+      if (*p == 0x86 || *p == 0x87) {
+         IsShortName += (*p == 0x86) ? 1 : -1;
+         memmove(to, to + l, len + 1); // remove the control code; we also copy the terminating 0!
+         l = 0; // 'to' already points to the next character
       }
-   bool singleByte;
-   const char *cs = getCharacterTable(from, len, &singleByte);
-   // FIXME Need to make this UTF-8 aware (different control codes).
-   // However, there's yet to be found a broadcaster that actually
-   // uses UTF-8 for the SI data... (kls 2007-06-10)
-   for (int i = 0; i < len; i++) {
-      if (    ((' ' <= *from) && (*from <= '~'))
-           || (*from == '\n')
-           || (0xA0 <= *from)
-         )
-      {
-         *to++ = *from;
-         if (IsShortName)
-            *toShort++ = *from;
+      if (l && IsShortName) {
+         if (l < sizeShortVersion) { // leaves room for the terminating 0
+            for (int i = 0; i < l; i++)
+                *shortVersion++ = to[i];
+            sizeShortVersion -= l;
+         }
       }
-      else if (*from == 0x8A)
-         *to++ = '\n';
-      else if (*from == 0x86)
-         IsShortName++;
-      else if (*from == 0x87)
-         IsShortName--;
-      else if (*from == 0)
-         break;
-      from++;
-      if (to - buffer >= sizeBuffer - 1 || toShort - shortVersion >= sizeShortVersion - 1)
-         break;
-   }
-   *to = '\0';
-   *toShort = '\0';
-   if (!singleByte || !SystemCharacterTableIsSingleByte) {
-      char convBuffer[sizeBuffer];
-      if (convertCharacterTable(buffer, strlen(buffer), convBuffer, sizeof(convBuffer), cs))
-         strncpy(buffer, convBuffer, strlen(convBuffer) + 1);
-      char convShortVersion[sizeShortVersion];
-      if (convertCharacterTable(shortVersion, strlen(shortVersion), convShortVersion, sizeof(convShortVersion), cs))
-         strncpy(shortVersion, convShortVersion, strlen(convShortVersion) + 1);
+      to += l;
    }
+   *shortVersion = '\0';
 }
 
 Descriptor *Descriptor::getDescriptor(CharArray da, DescriptorTagDomain domain, bool returnUnimplemetedDescriptor) {
-- 
cgit v1.2.3