summaryrefslogtreecommitdiff
path: root/epglv/src
diff options
context:
space:
mode:
Diffstat (limited to 'epglv/src')
-rw-r--r--epglv/src/epglv.c57
-rw-r--r--epglv/src/epglv.h52
-rw-r--r--epglv/src/epglvbase.c295
3 files changed, 404 insertions, 0 deletions
diff --git a/epglv/src/epglv.c b/epglv/src/epglv.c
new file mode 100644
index 0000000..80be809
--- /dev/null
+++ b/epglv/src/epglv.c
@@ -0,0 +1,57 @@
+/*
+
+EPG2VDR Levenshtein Distance UDF for MySQL
+Supports UTF-8 throught iconv.
+
+Copyright (C) 2013 Jörg Wendel
+
+*/
+
+#include "epglv.h"
+
+//***************************************************************************
+// EPGLV()
+//***************************************************************************
+
+my_bool epglv_init(UDF_INIT* init, UDF_ARGS* args, char* message)
+{
+ return base_epglv_init(init, args, message);
+}
+
+void epglv_deinit(UDF_INIT* init)
+{
+ base_epglv_deinit(init);
+}
+
+long long epglv(UDF_INIT* init, UDF_ARGS* args, char* is_null, char* error)
+{
+ long long len1 = 0, len2 = 0;
+ long long diff = base_epglv(init, args, is_null, error, &len1, &len2);
+
+ return diff;
+}
+
+//***************************************************************************
+// EPGLVR()
+//***************************************************************************
+
+my_bool epglvr_init(UDF_INIT* init, UDF_ARGS* args, char* message)
+{
+ return base_epglv_init(init, args, message);
+}
+
+void epglvr_deinit(UDF_INIT* init)
+{
+ base_epglv_deinit(init);
+}
+
+long long epglvr(UDF_INIT* init, UDF_ARGS* args, char* is_null, char* error)
+{
+ long long len1 = 0, len2 = 0;
+
+ long long diff = base_epglv(init, args, is_null, error, &len1, &len2);
+
+ long long len = len1 < len2 ? len2 : len1;
+
+ return (double)diff / ((double)len / 100.0);
+}
diff --git a/epglv/src/epglv.h b/epglv/src/epglv.h
new file mode 100644
index 0000000..39554b8
--- /dev/null
+++ b/epglv/src/epglv.h
@@ -0,0 +1,52 @@
+/*
+
+EPG2VDR Levenshtein Distance UDF for MySQL
+Supports UTF-8 throught iconv.
+
+Copyright (C) 2013 Jörg Wendel
+
+*/
+
+#ifndef __EPGLV_H
+#define __EPGLV_H
+
+#ifdef STANDARD
+# include <string.h>
+#else
+# include <my_global.h>
+# include <my_sys.h>
+#endif
+
+#include <mysql.h>
+#include <m_ctype.h>
+#include <m_string.h>
+
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+#include <wchar.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <iconv.h>
+#include <errno.h>
+#include <stdint.h>
+#include <assert.h>
+
+#define LENGTH_MAX 1000
+
+#define debug_print(fmt, ...) \
+ do { if (DEBUG_MYSQL) fprintf(stderr, "%s():%d> " fmt "\n", \
+ __func__, __LINE__, __VA_ARGS__); fflush(stderr); } while (0)
+
+#define print(fmt, ...) \
+ do { fprintf(stderr, "%s():%d> " fmt "\n", \
+ __func__, __LINE__, __VA_ARGS__); fflush(stderr); } while (0)
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
+#endif // __EPGLV_H
+
+my_bool base_epglv_init(UDF_INIT* init, UDF_ARGS* args, char* message);
+void base_epglv_deinit(UDF_INIT* init);
+long long base_epglv(UDF_INIT* init, UDF_ARGS* args, char* is_null,
+ char* error, long long* l1, long long* l2);
diff --git a/epglv/src/epglvbase.c b/epglv/src/epglvbase.c
new file mode 100644
index 0000000..845edcb
--- /dev/null
+++ b/epglv/src/epglvbase.c
@@ -0,0 +1,295 @@
+/*
+
+EPG2VDR Levenshtein Distance UDF for MySQL
+Supports UTF-8 i throught iconv.
+
+Copyright (C) 2013 Jörg Wendel
+
+*/
+
+#include "epglv.h"
+
+//***************************************************************************
+// Declarations
+//***************************************************************************
+
+struct workspace_t // structure to allocate memory in init and use it in core functions
+{
+ char* str1; // internal buffer to store 1st string
+ char* str2; // internal buffer to store 2nd string
+ int* row0; // round buffer for levenshtein_core function
+ int* row1; // round buffer for levenshtein_core function
+ int* row2; // round buffer for levenshtein_core function
+ mbstate_t* mbstate; // buffer for mbsnrtowcs
+ iconv_t ic; // buffer for iconv
+ char iconv_init; // flag indicating if iconv has been inited before
+};
+
+//***************************************************************************
+// Function Declarations
+//***************************************************************************
+
+int epglv_core(struct workspace_t *ws,
+ const char *str1, int len1,
+ const char *str2, int len2,
+ int w, int s, int a, int d, int limit);
+
+char* epglv_utf8toiso(const char *str_src, long long *len_src,
+ struct workspace_t * ws, char *str_dst, int limit);
+
+//***************************************************************************
+// The Work
+//***************************************************************************
+//***************************************************************************
+// init - check parameters and allocate memory for MySql
+//***************************************************************************
+
+my_bool base_epglv_init(UDF_INIT* init, UDF_ARGS* args, char* message)
+{
+ struct workspace_t *ws;
+
+ // make sure both arguments are ok
+
+ if (args->arg_count != 2 || args->arg_type[0] != STRING_RESULT || args->arg_type[1] != STRING_RESULT)
+ {
+ strncpy(message, "EPGLV() requires arguments (string, string)", 80);
+ return 1;
+ }
+
+ init->maybe_null = 0; // EPGLV() will not be returning null
+
+ // attempt to allocate memory in which to calculate distance
+
+ ws = (struct workspace_t*)malloc(sizeof(struct workspace_t));
+ ws->mbstate = (mbstate_t*)malloc(sizeof(mbstate_t));
+ ws->str1 = (char*)malloc(sizeof(char)*(LENGTH_MAX+2)); // max allocated for UTF-8 complex string
+ ws->str2 = (char*)malloc(sizeof(char)*(LENGTH_MAX+2));
+ ws->row0 = (int*)malloc(sizeof(int)*(LENGTH_MAX+2));
+ ws->row1 = (int*)malloc(sizeof(int)*(LENGTH_MAX+2));
+ ws->row2 = (int*)malloc(sizeof(int)*(LENGTH_MAX+2));
+ ws->iconv_init = 0;
+
+ if (!ws || !ws->mbstate || !ws->str1 || !ws->str2 || !ws->row0 || !ws->row1 || !ws->row2)
+ {
+ free(ws->row2); free(ws->row1); free(ws->row0);
+ free(ws->str2); free(ws->str1);
+ free(ws->mbstate); free(ws);
+ strncpy(message, "EPGLV() failed to allocate memory", 80);
+ return 1;
+ }
+
+ if (!setlocale(LC_CTYPE, "de_DE.UTF-8"))
+ {
+ free(ws->row2); free(ws->row1); free(ws->row0);
+ free(ws->str2); free(ws->str1);
+ free(ws->mbstate); free(ws);
+ strncpy(message, "EPGLV() failed to change locale", 80);
+ return 1;
+ }
+
+ init->ptr = (char *) ws;
+ debug_print("%s", "init successful");
+
+ return 0;
+}
+
+//***************************************************************************
+// deallocate memory, clean and close
+//***************************************************************************
+
+void base_epglv_deinit(UDF_INIT* init)
+{
+ if (init->ptr)
+ {
+ struct workspace_t* ws = (struct workspace_t*)init->ptr;
+
+ if (ws->iconv_init)
+ iconv_close(ws->ic);
+
+ free(ws->row2);
+ free(ws->row1);
+ free(ws->row0);
+ free(ws->str2);
+ free(ws->str1);
+ free(ws->mbstate);
+ free(ws);
+ }
+
+ debug_print("%s", "bye");
+}
+
+//***************************************************************************
+// check parameters and allocate memory for MySql
+//***************************************************************************
+
+long long base_epglv(UDF_INIT* init, UDF_ARGS* args, char* is_null, char* error,
+ long long* len1, long long* len2)
+{
+ const char* str1 = args->args[0];
+ const char* str2 = args->args[1];
+
+ char* iso_str1 = 0;
+ char* iso_str2 = 0;
+ struct workspace_t* ws = (struct workspace_t*)init->ptr; // get a pointer to memory previously allocated
+
+ *error = 0;
+ *len1 = (!str1) ? 0 : args->lengths[0];
+ *len2 = (!str2) ? 0 : args->lengths[1];
+ long long max = *len1 > *len2 ? *len1 : *len2;
+
+ if (max >= LENGTH_MAX)
+ {
+ print("size(%lld) was bigger than %d, aborting", max, LENGTH_MAX);
+ return -1;
+ }
+
+ if (!*len1 && !*len2)
+ return 0;
+
+ if (!*len1 || !*len2)
+ return max;
+
+ if ((iso_str1 = epglv_utf8toiso(str1, len1, ws, ws->str1, max)) == 0)
+ return -1;
+
+ if ((iso_str2 = epglv_utf8toiso(str2, len2, ws, ws->str2, max)) == 0)
+ return -1;
+
+ return epglv_core(
+ ws,
+ iso_str1, *len1,
+ iso_str2, *len2,
+ /* swap */ 1,
+ /* substitution */ 1,
+ /* insertion */ 1,
+ /* deletion */ 1,
+ /* limit */ max);
+}
+
+//***************************************************************************
+// core function
+//***************************************************************************
+
+int epglv_core(struct workspace_t* ws,
+ const char* str1, int len1,
+ const char* str2, int len2,
+ int w, int s, int a, int d, int limit)
+{
+ int* row0 = ws->row0;
+ int* row1 = ws->row1;
+ int* row2 = ws->row2;
+ int i, j;
+
+ for (j = 0; j <= len2; j++)
+ row1[j] = j * a;
+
+ for (i = 0; i < len1; i++)
+ {
+ int* dummy;
+
+ row2[0] = (i + 1) * d;
+
+ for (j = 0; j < len2; j++)
+ {
+ // substitution
+
+ row2[j + 1] = row1[j] + s * (str1[i] != str2[j]);
+
+ // swap
+
+ if (i > 0 && j > 0 &&
+ str1[i - 1] == str2[j] &&
+ str1[i] == str2[j - 1] &&
+ row2[j + 1] > row0[j - 1] + w)
+ {
+ row2[j + 1] = row0[j - 1] + w;
+ }
+
+ // deletion
+
+ if (row2[j + 1] > row1[j + 1] + d)
+ row2[j + 1] = row1[j + 1] + d;
+
+ // insertion
+
+ if (row2[j + 1] > row2[j] + a)
+ row2[j + 1] = row2[j] + a;
+ }
+
+ dummy = row0;
+ row0 = row1;
+ row1 = row2;
+ row2 = dummy;
+ }
+
+ debug_print("returning(%d)", row1[len2]);
+
+ return row1[len2];
+}
+
+//**************************************************************************
+// translates an UTF8 string to ISO with some error return codes
+//**************************************************************************
+
+char* epglv_utf8toiso(const char* str_src, long long* len_src,
+ struct workspace_t* ws, char* str_dst, int limit)
+{
+ mbstate_t* mbstate = ws->mbstate;
+ size_t len_mbsnrtowcs, len_ret = LENGTH_MAX, len_min = LENGTH_MAX;
+ char* ret = str_dst;
+ char* in_s = (char*)str_src;
+
+ memset((void*)mbstate, '\0', sizeof(mbstate_t));
+
+ if ((len_mbsnrtowcs = mbsnrtowcs(0, &str_src, *len_src, 0, mbstate)) == -1)
+ {
+ print("mbsnrtowcs failed, str_src(%s): '%m'", str_src);
+ strncpy(str_dst, str_src, len_min);
+ str_dst[len_min] = 0; str_dst[len_min + 1] = 0;
+ *len_src = len_min;
+ return str_dst;
+ }
+
+ len_min = MIN(len_mbsnrtowcs, limit);
+
+ if (len_mbsnrtowcs == *len_src)
+ {
+ strncpy(str_dst, str_src, len_min);
+ str_dst[len_min] = 0; str_dst[len_min + 1] = 0;
+ *len_src = len_min;
+ return str_dst;
+ }
+
+ if (!ws->iconv_init)
+ {
+ if ((ws->ic = iconv_open("ISO8859-15", "UTF-8")) == (iconv_t)-1)
+ {
+ print("%s", "failed to initialize iconv '%m'");
+ return 0;
+ }
+
+ ws->iconv_init = 1;
+ }
+
+ if (iconv(ws->ic, &in_s, (size_t*)len_src, &ret, &len_ret) == -1)
+ {
+ if (errno != E2BIG)
+ {
+ print("in_s '%s' at '%s'; len_src (%lld) len_ret (%zu) '%m'", str_src, in_s, *len_src, len_ret);
+ strncpy(str_dst, str_src, len_min);
+ str_dst[len_min] = 0; str_dst[len_min + 1] = 0;
+ *len_src = len_min;
+ return str_dst;
+ }
+
+ print("inside E2BIG len_mbsnrtowcs(%zu) len_src(%lld)", len_mbsnrtowcs, *len_src);
+ len_mbsnrtowcs = len_min;
+ }
+
+ *len_src = len_min; // adjust converted length
+ str_dst[len_min] = 0; str_dst[len_min + 1] = 0;
+
+ iconv(ws->ic, 0, 0, 0, 0);
+
+ return str_dst;
+}